def test_jamsframe_from_df():

    df = pd.DataFrame(data=[[0.0, 1.0, 'a', 0.0],
                            [1.0, 2.0, 'b', 0.0]],
                      columns=['time', 'duration', 'value', 'confidence'])

    jf = jams.JamsFrame.from_dataframe(df)

    # 1. type check
    assert isinstance(jf, jams.JamsFrame)

    # 2. check field order
    eq_(list(jf.keys().values), jams.JamsFrame.fields())

    # 3. check field types
    assert jf['time'].dtype == np.dtype('<m8[ns]')
    assert jf['duration'].dtype == np.dtype('<m8[ns]')

    # 4. check the values
    eq_(list(jf['time']), list(pd.to_timedelta([0.0, 1.0], unit='s')))
    eq_(list(jf['duration']), list(pd.to_timedelta([1.0, 2.0], unit='s')))
    eq_(list(jf['value']), ['a', 'b'])
    eq_(list(jf['confidence']), [0.0, 0.0])
def _decode_datetime_with_pandas(flat_num_dates, units, calendar):
    if calendar not in _STANDARD_CALENDARS:
        raise OutOfBoundsDatetime(
            'Cannot decode times from a non-standard calendar, {!r}, using '
            'pandas.'.format(calendar))

    delta, ref_date = _unpack_netcdf_time_units(units)
    delta = _netcdf_to_numpy_timeunit(delta)
    try:
        ref_date = pd.Timestamp(ref_date)
    except ValueError:
        # ValueError is raised by pd.Timestamp for non-ISO timestamp
        # strings, in which case we fall back to using cftime
        raise OutOfBoundsDatetime

    # fixes: https://github.com/pydata/pandas/issues/14068
    # these lines check if the lowest or the highest value in dates
    # cause an OutOfBoundsDatetime (Overflow) error
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', 'invalid value encountered',
                                RuntimeWarning)
        pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
        pd.to_timedelta(flat_num_dates.max(), delta) + ref_date

    # Cast input dates to integers of nanoseconds because `pd.to_datetime`
    # works much faster when dealing with integers
    # make _NS_PER_TIME_DELTA an array to ensure type upcasting
    flat_num_dates_ns_int = (flat_num_dates.astype(np.float64) *
                             _NS_PER_TIME_DELTA[delta]).astype(np.int64)

    return (pd.to_timedelta(flat_num_dates_ns_int, 'ns') + ref_date).values
def test_timedelta_ops_scalar(self):
    # GH 6808
    base = pd.to_datetime("20130101 09:01:12.123456")
    expected_add = pd.to_datetime("20130101 09:01:22.123456")
    expected_sub = pd.to_datetime("20130101 09:01:02.123456")

    for offset in [
        pd.to_timedelta(10, unit="s"),
        timedelta(seconds=10),
        np.timedelta64(10, "s"),
        np.timedelta64(10000000000, "ns"),
        pd.offsets.Second(10),
    ]:
        result = base + offset
        self.assertEqual(result, expected_add)

        result = base - offset
        self.assertEqual(result, expected_sub)

    base = pd.to_datetime("20130102 09:01:12.123456")
    expected_add = pd.to_datetime("20130103 09:01:22.123456")
    expected_sub = pd.to_datetime("20130101 09:01:02.123456")

    for offset in [
        pd.to_timedelta("1 day, 00:00:10"),
        pd.to_timedelta("1 days, 00:00:10"),
        timedelta(days=1, seconds=10),
        np.timedelta64(1, "D") + np.timedelta64(10, "s"),
        pd.offsets.Day() + pd.offsets.Second(10),
    ]:
        result = base + offset
        self.assertEqual(result, expected_add)

        result = base - offset
        self.assertEqual(result, expected_sub)
def show_overlap_aligned(pc, mc, start=0, length_or_end=100, **dvo_kwargs):
    """pc and mc are aligned dataframes.

    start is either an integer (row number) or a string (datetime).
    length_or_end is either an integer (row numbers, or seconds) or a
    string (like '10 min', '5 s', or '12:45:43').
    dvo_kwargs are keyword arguments for
    utils.plotting.categorical.dummy_variable_overlaps.
    """
    if isinstance(start, str):
        start = _read_or_infer_datetime(start, pc.index)
        if isinstance(length_or_end, str):
            try:
                end = _read_or_infer_datetime(length_or_end, pc.index)
            except ValueError:
                end = start + pd.to_timedelta(length_or_end)
        else:
            end = start + pd.to_timedelta(length_or_end, unit='s')
        pc_sub = pc[start:end]
        mc_sub = mc[start:end]
    else:
        end = start + length_or_end
        pc_sub = pc.iloc[start:end]
        mc_sub = mc.iloc[start:end]

    kwargs = {'drop_empty_rows': False,
              'drop_empty_cols': False}
    kwargs.update(dvo_kwargs)
    plotcat.dummy_variable_overlaps(pc_sub, mc_sub, 'pc', 'mc',
                                    x_label='AlmNr', y_label='Time',
                                    **kwargs)
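# The two branches above just resolve a (start, end) window and slice both
# frames the same way. A minimal standalone pandas sketch of that window
# logic (the DataFrame and values here are made up; the real helper
# _read_or_infer_datetime and the plotting call are not reproduced):
import pandas as pd

pc = pd.DataFrame({'alarm': range(120)},
                  index=pd.date_range('2020-01-01 12:00', periods=120, freq='min'))

# string start + timedelta-like string length: parse the length with pd.to_timedelta
start = pd.to_datetime('2020-01-01 12:30')
end = start + pd.to_timedelta('10 min')
print(pc[start:end].shape)        # (11, 1) -- label slicing is inclusive on both ends

# integer start + integer length: positional rows via iloc
print(pc.iloc[20:20 + 10].shape)  # (10, 1)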
def test_val_to_num():
    assert val_to_num('7') == 7
    assert val_to_num('.7') == .7
    assert val_to_num('0.7') == .7
    assert val_to_num('07') == 7
    assert val_to_num('0') == 0
    assert val_to_num('00') == 0
    assert val_to_num('-20') == -20
    assert val_to_num(7) == 7
    assert val_to_num(0.7) == 0.7
    assert val_to_num(0) == 0
    assert val_to_num('NOW') == 'NOW'
    assert val_to_num('now') == 'now'
    assert val_to_num('TODAY') == 'TODAY'
    assert val_to_num('') == ''
    assert val_to_num('2018-10-10') == pd.to_datetime('2018-10-10')
    assert val_to_num('2018-10-09') == pd.to_datetime('2018-10-09')
    assert val_to_num('2017-12') == pd.to_datetime('2017-12')
    assert val_to_num('5e+6') == 5e6
    assert val_to_num('5e-6') == 5e-6
    assert val_to_num('0xabc') == '0xabc'
    assert val_to_num('hello world') == 'hello world'

    # The following tests document an idiosyncrasy of val_to_num which is
    # difficult to avoid while timedeltas are supported.
    assert val_to_num('50+20') == pd.to_timedelta('50+20')
    assert val_to_num('50-20') == pd.to_timedelta('50-20')
def __init__(self, data=None, index=None, columns=None, dtype=None):
    '''Construct a new JamsFrame object.

    Parameters
    ----------
    data
        Optional data for the new JamsFrame, in any format supported
        by `pandas.DataFrame.__init__`.

        Fields must be `['time', 'duration', 'value', 'confidence']`.

        `time` and `duration` fields must be floating point types,
        measured in seconds.

    index
        Optional index on `data`.

    columns
    dtype
        These parameters are ignored by JamsFrame, but are allowed
        for API compatibility with `pandas.DataFrame`.

    See Also
    --------
    from_dict
    from_dataframe
    pandas.DataFrame.__init__
    '''
    super(JamsFrame, self).__init__(data=data, index=index,
                                    columns=self.fields())

    self.time = pd.to_timedelta(self.time, unit='s')
    self.duration = pd.to_timedelta(self.duration, unit='s')
def readCSV(allFiles, saveCSV):
    """Read CSV files that record piston dendrometer displacement."""
    dfs = dict()  # create a blank dictionary
    for file_ in allFiles:
        # Key: filename; Value: pandas data frame
        dfs[file_] = pd.read_csv(file_, header=None, parse_dates={"Year": [1]})
        # Parse the year / day of year / time column to a single datetime64 index
        dfs[file_].index = (dfs[file_].Year +
                            pd.to_timedelta(dfs[file_][2], unit='D') +
                            # loggers on DLS; minus one hour to set to UTC-8
                            pd.to_timedelta(dfs[file_][3] // 100 - 1, unit='H') +
                            pd.to_timedelta(dfs[file_][3] % 100, unit='m'))
        del dfs[file_]['Year'], dfs[file_][0], dfs[file_][2], dfs[file_][3]
        # Relabel columns with file name + column number + 1
        # to match the Campbell SE channel in the data logger
        for column in dfs[file_]:
            dfs[file_].rename(
                columns={column: file_ + str(dfs[file_].columns.get_loc(column) + 1)},
                inplace=True)
    # Merge each dataframe in the dictionary, by datetime stamp
    merge = functools.partial(pd.merge, left_index=True, right_index=True, how='outer')
    radius = functools.reduce(merge, dfs.values())
    if saveCSV:
        radius.to_csv(r'Data\merged_radius.csv')
    return radius
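# A small standalone sketch of the year / day-of-year / HHMM reconstruction
# used above, with made-up values and without the logger's one-hour
# daylight-saving correction:
import pandas as pd

year = pd.Series(pd.to_datetime(['2020', '2020'], format='%Y'))  # Jan 1 of each year
doy = pd.Series([32, 32])        # day-of-year column
hhmm = pd.Series([1330, 1345])   # logger time as HHMM integers

timestamps = (year
              + pd.to_timedelta(doy, unit='D')
              + pd.to_timedelta(hhmm // 100, unit='h')
              + pd.to_timedelta(hhmm % 100, unit='m'))
print(timestamps)
# 0   2020-02-02 13:30:00
# 1   2020-02-02 13:45:00
# dtype: datetime64[ns]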
def test_infer_timedelta_units(self):
    for deltas, expected in [
            (pd.to_timedelta(['1 day', '2 days']), 'days'),
            (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'),
            (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'),
            (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]:
        assert expected == coding.times.infer_timedelta_units(deltas)
def from_dataframe(cls, frame):
    '''Convert a pandas DataFrame into a JamsFrame.

    Note: this operation is destructive, in that the input DataFrame
    will have its type and data altered.

    Parameters
    ----------
    frame : pandas.DataFrame
        The input DataFrame.  Must have the appropriate JamsFrame fields:
        'time', 'duration', 'value', and 'confidence'.

        'time' and 'duration' fields should be of type `float` and
        measured in seconds.

    Returns
    -------
    jams_frame : JamsFrame
        The input `frame` modified to form a JamsFrame.

    See Also
    --------
    from_dict
    '''
    # Encode time properly
    frame.time = pd.to_timedelta(frame.time, unit='s')
    frame.duration = pd.to_timedelta(frame.duration, unit='s')

    # Properly order the columns
    frame = frame[cls.fields()]

    # Clobber the class attribute
    frame.__class__ = cls
    return frame
def test_timedelta_ops_scalar(self):
    # GH 6808
    base = pd.to_datetime('20130101 09:01:12.123456')
    expected_add = pd.to_datetime('20130101 09:01:22.123456')
    expected_sub = pd.to_datetime('20130101 09:01:02.123456')

    for offset in [pd.to_timedelta(10, unit='s'),
                   timedelta(seconds=10),
                   np.timedelta64(10, 's'),
                   np.timedelta64(10000000000, 'ns'),
                   pd.offsets.Second(10)]:
        result = base + offset
        assert result == expected_add

        result = base - offset
        assert result == expected_sub

    base = pd.to_datetime('20130102 09:01:12.123456')
    expected_add = pd.to_datetime('20130103 09:01:22.123456')
    expected_sub = pd.to_datetime('20130101 09:01:02.123456')

    for offset in [pd.to_timedelta('1 day, 00:00:10'),
                   pd.to_timedelta('1 days, 00:00:10'),
                   timedelta(days=1, seconds=10),
                   np.timedelta64(1, 'D') + np.timedelta64(10, 's'),
                   pd.offsets.Day() + pd.offsets.Second(10)]:
        result = base + offset
        assert result == expected_add

        result = base - offset
        assert result == expected_sub
def deform_annotation(self, annotation, state):
    '''Deform the annotation'''
    track_duration = state['duration']

    # Get the time in seconds
    t = np.asarray([x.total_seconds() for x in annotation.data.time])
    if self.time:
        # Deform
        t += np.random.normal(loc=self.mean, scale=self.sigma, size=t.shape)

        # Clip to the track duration
        t = np.clip(t, 0, track_duration)
        annotation.data.time = pd.to_timedelta(t, unit='s')

    # Get the duration in seconds
    d = np.asarray([x.total_seconds() for x in annotation.data.duration])
    if self.duration:
        # Deform
        d += np.random.normal(loc=self.mean, scale=self.sigma, size=d.shape)

        # Clip to the track duration - interval start
        d = [np.clip(d_i, 0, track_duration - t_i) for (d_i, t_i) in zip(d, t)]
        annotation.data.duration = pd.to_timedelta(d, unit='s')
def test_nat_converters(self):
    assert to_timedelta('nat', box=False).astype('int64') == iNaT
    assert to_timedelta('nan', box=False).astype('int64') == iNaT

    def testit(unit, transform):
        # array
        result = to_timedelta(np.arange(5), unit=unit)
        expected = TimedeltaIndex([np.timedelta64(i, transform(unit))
                                   for i in np.arange(5).tolist()])
        tm.assert_index_equal(result, expected)

        # scalar
        result = to_timedelta(2, unit=unit)
        expected = Timedelta(np.timedelta64(2, transform(unit)).astype(
            'timedelta64[ns]'))
        assert result == expected

    # validate all units
    # GH 6855
    for unit in ['Y', 'M', 'W', 'D', 'y', 'w', 'd']:
        testit(unit, lambda x: x.upper())
    for unit in ['days', 'day', 'Day', 'Days']:
        testit(unit, lambda x: 'D')
    for unit in ['h', 'm', 's', 'ms', 'us', 'ns', 'H', 'S', 'MS', 'US', 'NS']:
        testit(unit, lambda x: x.lower())

    # offsets
    # m
    testit('T', lambda x: 'm')

    # ms
    testit('L', lambda x: 'ms')
def test_timedelta(self):
    converter = lambda x: pd.to_timedelta(x, unit="ms")

    s = Series([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(s.dtype, "timedelta64[ns]")

    # index will be float dtype
    assert_series_equal(s,
                        pd.read_json(s.to_json(), typ="series").apply(converter),
                        check_index_type=False)

    s = Series([timedelta(23), timedelta(seconds=5)],
               index=pd.Index([0, 1], dtype=float))
    self.assertEqual(s.dtype, "timedelta64[ns]")
    assert_series_equal(s, pd.read_json(s.to_json(), typ="series").apply(converter))

    frame = DataFrame([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(frame[0].dtype, "timedelta64[ns]")
    assert_frame_equal(
        frame,
        pd.read_json(frame.to_json()).apply(converter),
        check_index_type=False,
        check_column_type=False
    )

    frame = DataFrame(
        {
            "a": [timedelta(days=23), timedelta(seconds=5)],
            "b": [1, 2],
            "c": pd.date_range(start="20130101", periods=2),
        }
    )

    result = pd.read_json(frame.to_json(date_unit="ns"))
    result["a"] = pd.to_timedelta(result.a, unit="ns")
    result["c"] = pd.to_datetime(result.c)

    assert_frame_equal(frame, result, check_index_type=False)
def test_timedelta_ops(self):
    _skip_if_numpy_not_friendly()

    # GH4984
    # make sure ops return timedeltas
    s = Series([Timestamp('20130101') + timedelta(seconds=i * i)
                for i in range(10)])
    td = s.diff()

    result = td.mean()[0]
    # TODO This should have returned a scalar to begin with. Hack for now.
    expected = to_timedelta(timedelta(seconds=9))
    tm.assert_almost_equal(result, expected)

    result = td.quantile(.1)
    # This properly returned a scalar.
    expected = to_timedelta('00:00:02.6')
    tm.assert_almost_equal(result, expected)

    result = td.median()[0]
    # TODO This should have returned a scalar to begin with. Hack for now.
    expected = to_timedelta('00:00:08')
    tm.assert_almost_equal(result, expected)

    # GH 6462
    # consistency in returned values for sum
    result = td.sum()[0]
    expected = to_timedelta('00:01:21')
    tm.assert_almost_equal(result, expected)
def test_timedelta_ops_scalar(self):
    _skip_if_numpy_not_friendly()

    # GH 6808
    base = pd.to_datetime('20130101 09:01:12.123456')
    expected_add = pd.to_datetime('20130101 09:01:22.123456')
    expected_sub = pd.to_datetime('20130101 09:01:02.123456')

    for offset in [pd.to_timedelta(10, unit='s'),
                   timedelta(seconds=10),
                   np.timedelta64(10, 's'),
                   np.timedelta64(10000000000, 'ns'),
                   pd.offsets.Second(10)]:
        result = base + offset
        self.assertEquals(result, expected_add)

        result = base - offset
        self.assertEquals(result, expected_sub)

    base = pd.to_datetime('20130102 09:01:12.123456')
    expected_add = pd.to_datetime('20130103 09:01:22.123456')
    expected_sub = pd.to_datetime('20130101 09:01:02.123456')

    for offset in [pd.to_timedelta('1 day, 00:00:10'),
                   pd.to_timedelta('1 days, 00:00:10'),
                   timedelta(days=1, seconds=10),
                   np.timedelta64(1, 'D') + np.timedelta64(10, 's'),
                   pd.offsets.Day() + pd.offsets.Second(10)]:
        result = base + offset
        self.assertEquals(result, expected_add)

        result = base - offset
        self.assertEquals(result, expected_sub)
def test_infer_timedelta_units(self):
    for deltas, expected in [
            (pd.to_timedelta(['1 day', '2 days']), 'days'),
            (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'),
            (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'),
            (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]:
        self.assertEqual(expected, conventions.infer_timedelta_units(deltas))
def test_timedelta(self):
    converter = lambda x: pd.to_timedelta(x, unit='ms')

    s = Series([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(s.dtype, 'timedelta64[ns]')

    result = pd.read_json(s.to_json(), typ='series').apply(converter)
    assert_series_equal(result, s)

    s = Series([timedelta(23), timedelta(seconds=5)],
               index=pd.Index([0, 1]))
    self.assertEqual(s.dtype, 'timedelta64[ns]')
    result = pd.read_json(s.to_json(), typ='series').apply(converter)
    assert_series_equal(result, s)

    frame = DataFrame([timedelta(23), timedelta(seconds=5)])
    self.assertEqual(frame[0].dtype, 'timedelta64[ns]')
    assert_frame_equal(frame, pd.read_json(frame.to_json())
                       .apply(converter))

    frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)],
                       'b': [1, 2],
                       'c': pd.date_range(start='20130101', periods=2)})

    result = pd.read_json(frame.to_json(date_unit='ns'))
    result['a'] = pd.to_timedelta(result.a, unit='ns')
    result['c'] = pd.to_datetime(result.c)
    assert_frame_equal(frame, result)
def compute_sparseness(row, ds_sep):
    """
    Computes the "sparseness" of a sparse row. There are three levels of sparseness:

        SPARSE1: SPARSITY_MIN <= row.mean_sep / ds_sep < SPARSITY_MID
        SPARSE2: SPARSITY_MID <= row.mean_sep / ds_sep < SPARSITY_MAX
        SPARSE3: SPARSITY_MAX <= row.mean_sep / ds_sep

    :param row: The dataset row containing the sparse data
    :param ds_sep: average separation of data points in the dataset
    :return: The appropriate "sparseness" level
    """
    # calculate the row's data density ratio
    if 'mean_sep' in row:
        sep_ratio = row.mean_sep / pd.to_timedelta(ds_sep, 's')
    else:
        # calculate the density from the row
        interval = row.last - row.first
        mean_sep = interval / row.count
        sep_ratio = mean_sep / pd.to_timedelta(ds_sep, 's')

    # in case this method is called on a row that isn't sparse, default to having data present.
    ret_val = PRESENT
    if sep_ratio >= SPARSITY_MAX:
        ret_val = SPARSE3
    elif sep_ratio >= SPARSITY_MID:
        ret_val = SPARSE2
    elif sep_ratio >= SPARSITY_MIN:
        ret_val = SPARSE1

    return ret_val
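# How the ratio test above plays out, sketched with stand-in thresholds (the
# real SPARSITY_* constants and the SPARSE*/PRESENT values are defined
# elsewhere in the module):
import pandas as pd

SPARSITY_MIN, SPARSITY_MID, SPARSITY_MAX = 2, 5, 10   # stand-ins, not the real values

mean_sep = pd.to_timedelta('90s')                     # average gap between points in one row
ds_sep = 10                                           # dataset-wide average gap, in seconds
sep_ratio = mean_sep / pd.to_timedelta(ds_sep, 's')   # Timedelta / Timedelta -> 9.0

print(SPARSITY_MID <= sep_ratio < SPARSITY_MAX)       # True: this row would be SPARSE2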
def ymd_to_dt(df, utc=True):
    return (pd.to_datetime(df["year"].astype(str) + "-"
                           + df["month"].astype(str) + "-"
                           + df["day"].astype(str), utc=utc)
            + pd.to_timedelta(df["hour"].astype(str) + "H")
            + pd.to_timedelta(df["minute"].astype(str) + "M")
            + pd.to_timedelta(df["second"].astype(str) + "S"))
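# Usage sketch for ymd_to_dt with a made-up frame. The hour/minute/second
# columns become strings like '15H', '9M', '26S' and are parsed by
# pd.to_timedelta:
import pandas as pd

df = pd.DataFrame({"year": [2021], "month": [3], "day": [14],
                   "hour": [15], "minute": [9], "second": [26]})
print(ymd_to_dt(df))
# 0   2021-03-14 15:09:26+00:00
# dtype: datetime64[ns, UTC]
# (on pandas versions that still accept the 'H'/'M'/'S' string suffixes)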
def test_nat_converters(self):
    result = to_timedelta('nat', box=False)
    assert result.dtype.kind == 'm'
    assert result.astype('int64') == iNaT

    result = to_timedelta('nan', box=False)
    assert result.dtype.kind == 'm'
    assert result.astype('int64') == iNaT
def test_nat_converters(self):
    result = to_timedelta('nat').to_numpy()
    assert result.dtype.kind == 'M'
    assert result.astype('int64') == iNaT

    result = to_timedelta('nan').to_numpy()
    assert result.dtype.kind == 'M'
    assert result.astype('int64') == iNaT
def decode_cf_datetime(num_dates, units, calendar=None):
    """Given an array of numeric dates in netCDF format, convert it into a
    numpy array of datetime objects.

    For standard (Gregorian) calendars, this function uses vectorized
    operations, which makes it much faster than cftime.num2date. In such a
    case, the returned array will be of type np.datetime64.

    Note that time unit in `units` must not be smaller than microseconds and
    not larger than days.

    See also
    --------
    cftime.num2date
    """
    num_dates = np.asarray(num_dates)
    flat_num_dates = num_dates.ravel()
    if calendar is None:
        calendar = 'standard'

    delta, ref_date = _unpack_netcdf_time_units(units)

    try:
        if calendar not in _STANDARD_CALENDARS:
            raise OutOfBoundsDatetime

        delta = _netcdf_to_numpy_timeunit(delta)
        try:
            ref_date = pd.Timestamp(ref_date)
        except ValueError:
            # ValueError is raised by pd.Timestamp for non-ISO timestamp
            # strings, in which case we fall back to using cftime
            raise OutOfBoundsDatetime

        # fixes: https://github.com/pydata/pandas/issues/14068
        # these lines check if the lowest or the highest value in dates
        # cause an OutOfBoundsDatetime (Overflow) error
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', 'invalid value encountered',
                                    RuntimeWarning)
            pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
            pd.to_timedelta(flat_num_dates.max(), delta) + ref_date

        # Cast input dates to integers of nanoseconds because `pd.to_datetime`
        # works much faster when dealing with integers
        # make _NS_PER_TIME_DELTA an array to ensure type upcasting
        flat_num_dates_ns_int = (flat_num_dates.astype(np.float64) *
                                 _NS_PER_TIME_DELTA[delta]).astype(np.int64)

        dates = (pd.to_timedelta(flat_num_dates_ns_int, 'ns') +
                 ref_date).values

    except (OutOfBoundsDatetime, OverflowError):
        dates = _decode_datetime_with_cftime(
            flat_num_dates.astype(np.float), units, calendar)

    return dates.reshape(num_dates.shape)
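# The fast path above boils down to "reference timestamp + vectorized
# timedeltas". A self-contained sketch for units like "days since 1900-01-01"
# (values made up):
import numpy as np
import pandas as pd

num_dates = np.array([0, 1, 365.25])        # numeric offsets from the reference date
ref_date = pd.Timestamp('1900-01-01')

# integer nanoseconds keep pd.to_timedelta on its fast integer path
ns_per_day = 24 * 60 * 60 * 10**9
num_dates_ns = (num_dates * ns_per_day).astype(np.int64)

decoded = (pd.to_timedelta(num_dates_ns, 'ns') + ref_date).values
print(decoded)
# ['1900-01-01T00:00:00.000000000' '1900-01-02T00:00:00.000000000'
#  '1901-01-01T06:00:00.000000000']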
def test_to_timedelta_box_deprecated(self):
    result = np.timedelta64(0, 'ns')

    # Deprecated - see GH24416
    with tm.assert_produces_warning(FutureWarning):
        to_timedelta(0, box=False)

    expected = to_timedelta(0).to_timedelta64()
    assert result == expected
def test_timedelta_ops(self):
    # GH#4984
    # make sure ops return Timedelta
    s = Series([Timestamp('20130101') + timedelta(seconds=i * i)
                for i in range(10)])
    td = s.diff()

    result = td.mean()
    expected = to_timedelta(timedelta(seconds=9))
    assert result == expected

    result = td.to_frame().mean()
    assert result[0] == expected

    result = td.quantile(.1)
    expected = Timedelta(np.timedelta64(2600, 'ms'))
    assert result == expected

    result = td.median()
    expected = to_timedelta('00:00:09')
    assert result == expected

    result = td.to_frame().median()
    assert result[0] == expected

    # GH#6462
    # consistency in returned values for sum
    result = td.sum()
    expected = to_timedelta('00:01:21')
    assert result == expected

    result = td.to_frame().sum()
    assert result[0] == expected

    # std
    result = td.std()
    expected = to_timedelta(Series(td.dropna().values).std())
    assert result == expected

    result = td.to_frame().std()
    assert result[0] == expected

    # invalid ops
    for op in ['skew', 'kurt', 'sem', 'prod']:
        msg = "reduction operation '{}' not allowed for this dtype"
        with pytest.raises(TypeError, match=msg.format(op)):
            getattr(td, op)()

    # GH#10040
    # make sure NaT is properly handled by median()
    s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07')])
    assert s.diff().median() == timedelta(days=4)

    s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'),
                Timestamp('2015-02-15')])
    assert s.diff().median() == timedelta(days=6)
def test_contains(self):
    # Checking for any NaT-like objects
    # GH 13603
    td = to_timedelta(range(5), unit='d') + pd.offsets.Hour(1)
    for v in [pd.NaT, None, float('nan'), np.nan]:
        assert not (v in td)

    td = to_timedelta([pd.NaT])
    for v in [pd.NaT, None, float('nan'), np.nan]:
        assert (v in td)
def deform_times(ann, state):
    '''Deform time values for all annotations.'''
    ann.data.time = [pd.to_timedelta(x.total_seconds() / state['rate'], unit='s')
                     for x in ann.data.time]
    ann.data.duration = [pd.to_timedelta(x.total_seconds() / state['rate'], unit='s')
                         for x in ann.data.duration]
def get_timeseries(self, tail_n=-1):
    """Convert the captured values into a pandas.Series with a TimedeltaIndex."""
    # cap tail_n at the number of captured values; the original min(..., 0)
    # made the first branch unreachable
    tail_n = min(len(self.values), tail_n)
    if tail_n > 0:
        return pd.Series(self.values[:-tail_n],
                         index=pd.to_timedelta(self.indices[:-tail_n], unit="s"))
    else:
        return pd.Series(self.values,
                         index=pd.to_timedelta(self.indices, unit="s"))
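# What get_timeseries produces, sketched with literal values (the capture
# buffer attributes self.values / self.indices are assumed):
import pandas as pd

values = [0.1, 0.4, 0.9]
indices = [0.0, 0.5, 1.0]   # capture times in seconds

s = pd.Series(values, index=pd.to_timedelta(indices, unit="s"))
print(s)
# 0 days 00:00:00           0.1
# 0 days 00:00:00.500000    0.4
# 0 days 00:00:01           0.9
# dtype: float64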
def __test(data):
    ann = Annotation(namespace='onset')

    # Bypass the safety checks in add_observation
    ann.data.loc[0] = {'time': pd.to_timedelta(data['time'], unit='s'),
                       'duration': pd.to_timedelta(data['duration'], unit='s'),
                       'value': None,
                       'confidence': None}

    ann.validate()
def test_timedelta_ops(self):
    # GH4984
    # make sure ops return Timedelta
    s = Series([Timestamp('20130101') + timedelta(seconds=i * i)
                for i in range(10)])
    td = s.diff()

    result = td.mean()
    expected = to_timedelta(timedelta(seconds=9))
    self.assertEqual(result, expected)

    result = td.to_frame().mean()
    self.assertEqual(result[0], expected)

    result = td.quantile(.1)
    expected = Timedelta(np.timedelta64(2600, 'ms'))
    self.assertEqual(result, expected)

    result = td.median()
    expected = to_timedelta('00:00:09')
    self.assertEqual(result, expected)

    result = td.to_frame().median()
    self.assertEqual(result[0], expected)

    # GH 6462
    # consistency in returned values for sum
    result = td.sum()
    expected = to_timedelta('00:01:21')
    self.assertEqual(result, expected)

    result = td.to_frame().sum()
    self.assertEqual(result[0], expected)

    # std
    result = td.std()
    expected = to_timedelta(Series(td.dropna().values).std())
    self.assertEqual(result, expected)

    result = td.to_frame().std()
    self.assertEqual(result[0], expected)

    # invalid ops
    for op in ['skew', 'kurt', 'sem', 'prod']:
        self.assertRaises(TypeError, getattr(td, op))

    # GH 10040
    # make sure NaT is properly handled by median()
    s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07')])
    self.assertEqual(s.diff().median(), timedelta(days=4))

    s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'),
                Timestamp('2015-02-15')])
    self.assertEqual(s.diff().median(), timedelta(days=6))
def testit(unit, transform):
    # array
    result = to_timedelta(np.arange(5), unit=unit)
    expected = Series([np.timedelta64(i, transform(unit))
                       for i in np.arange(5).tolist()])
    tm.assert_series_equal(result, expected)

    # scalar
    result = to_timedelta(2, unit=unit)
    expected = np.timedelta64(2, transform(unit)).astype('timedelta64[ns]')
    self.assert_numpy_array_equal(result, expected)
_, window, cutoff_date_str = sys.argv
cutoff_date = pd.to_datetime(cutoff_date_str)

# gsheet: Forecasting/Tix Targets
# only on rolling forecasts
fin = '~/Forecasts/rolling/par/adj_r_xls_' + window + '_' + cutoff_date_str + '.par'  # input file
f_df = pd.read_parquet(fin)  # load last adjusted fcast
if f_df is None:
    s_ut.my_print('ERROR: could not find ' + fin)
    sys.exit()

# week_starting patch
df_cols_ = f_df.columns
if 'ds_week_ending' in df_cols_ and 'ds_week_starting' not in df_cols_:
    f_df['ds_week_ending'] = pd.to_datetime(f_df['ds_week_ending'])
    f_df['ds_week_starting'] = f_df['ds_week_ending'] - pd.to_timedelta(6, unit='D')
dates = [str(pd.to_datetime(x).date()) for x in f_df['ds_week_starting'].unique()]

# China
print('************** China ***************')
cn_plan = prepare_plan('Initiatives - China.csv', cutoff_date, True)
cn_df = adjust_fcast(cn_plan, f_df, 'China')

# Homes
print('************** Homes ***************')
hm_plan = prepare_plan('Initiatives - Homes.csv', cutoff_date, False)
print('tix: ' + str(f_df[
class TestDataFrameUnaryOperators(object):
    # __pos__, __neg__, __inv__

    @pytest.mark.parametrize('df,expected', [
        (pd.DataFrame({'a': [-1, 1]}),
         pd.DataFrame({'a': [1, -1]})),
        (pd.DataFrame({'a': [False, True]}),
         pd.DataFrame({'a': [True, False]})),
        (pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
         pd.DataFrame({'a': pd.Series(pd.to_timedelta([1, -1]))}))
    ])
    def test_neg_numeric(self, df, expected):
        assert_frame_equal(-df, expected)
        assert_series_equal(-df['a'], expected['a'])

    @pytest.mark.parametrize('df, expected', [
        (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
        ([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'), Decimal('-2.0')]),
    ])
    def test_neg_object(self, df, expected):
        # GH#21380
        df = pd.DataFrame({'a': df})
        expected = pd.DataFrame({'a': expected})
        assert_frame_equal(-df, expected)
        assert_series_equal(-df['a'], expected['a'])

    @pytest.mark.parametrize('df', [
        pd.DataFrame({'a': ['a', 'b']}),
        pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
    ])
    def test_neg_raises(self, df):
        with pytest.raises(TypeError):
            (-df)
        with pytest.raises(TypeError):
            (-df['a'])

    def test_invert(self):
        _seriesd = tm.getSeriesData()
        df = pd.DataFrame(_seriesd)

        assert_frame_equal(-(df < 0), ~(df < 0))

    @pytest.mark.parametrize('df', [
        pd.DataFrame({'a': [-1, 1]}),
        pd.DataFrame({'a': [False, True]}),
        pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}),
    ])
    def test_pos_numeric(self, df):
        # GH#16073
        assert_frame_equal(+df, df)
        assert_series_equal(+df['a'], df['a'])

    @pytest.mark.parametrize('df', [
        # numpy changing behavior in the future
        pytest.param(pd.DataFrame({'a': ['a', 'b']}),
                     marks=[pytest.mark.filterwarnings("ignore")]),
        pd.DataFrame({'a': np.array([-1, 2], dtype=object)}),
        pd.DataFrame({'a': [Decimal('-1.0'), Decimal('2.0')]}),
    ])
    def test_pos_object(self, df):
        # GH#21380
        assert_frame_equal(+df, df)
        assert_series_equal(+df['a'], df['a'])

    @pytest.mark.parametrize('df', [
        pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}),
    ])
    def test_pos_raises(self, df):
        with pytest.raises(TypeError):
            (+df)
        with pytest.raises(TypeError):
            (+df['a'])
def to_Timedelta(days: int) -> _pd.Timedelta:
    return _pd.to_timedelta(str(days) + 'D')
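# Equivalence check for the helper above (assuming _pd is the module's pandas
# import alias):
import pandas as _pd

assert to_Timedelta(3) == _pd.Timedelta(days=3)
assert _pd.to_timedelta('3D') == _pd.Timedelta(days=3)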
            ],
        ),
        html.Div(
            id="info-container",
            className="six columns",
            children=[
                html.Div(
                    id="dropout1",
                    className="six columns inner-row",
                    style={'width': '35%'},
                    children=[
                        html.Div(children='''Период'''),  # "Period" date range picker
                        dcc.DatePickerRange(
                            id='date-picker-range',
                            start_date=datetime.now() - pd.to_timedelta('2H'),
                            display_format='DD-MM-YYYY',
                            end_date=datetime.now() + pd.to_timedelta('0.5H'),
                        ),
                    ],
                ),
                html.Div(
                    id="subject-dropout-box",
                    className="six columns",
                    style={'width': '20%'},
                    children=[
                        html.Div(children='''Сервис'''),  # "Service" dropdown
                        dcc.Dropdown(
                            id="subject-dropout",
                            options=[{
    data = data[['TowerLon', 'TowerLat']]
    from sklearn.cluster import KMeans
    model = KMeans(n_clusters=clusters)
    model.fit(data)
    return model


#
# TODO: Load up the dataset and take a peek at its head and dtypes.
# Convert the date using pd.to_datetime, and the time using pd.to_timedelta
#
# .. your code here ..
df = pd.read_csv('Datasets/CDR.csv')
df['CallDate'] = pd.to_datetime(df['CallDate'])
df['CallTime'] = pd.to_timedelta(df['CallTime'])

#
# TODO: Create a unique list of the phone-number values (users) stored in the
# "In" column of the dataset, and save it to a variable called `unique_numbers`.
# Manually check through unique_numbers to ensure the order the numbers appear is
# the same order they appear (uniquely) in your dataset:
#
# .. your code here ..
In = df['In']
unique_numbers = In.unique()

#
# INFO: The locations map above should be too "busy" to really wrap your head around. This
# is where domain expertise comes into play. Your intuition tells you that people are likely
# to behave differently on weekends:
# Store the destination and starting cells
chunk["DESTINATION"] = chunk.GRID_POLYLINE.map(lambda x: x[-1])
chunk["START_CELL"] = chunk.GRID_POLYLINE.map(lambda x: x[0])

# Loop through the cutoff dates. For every cutoff date:
#   1. The active trips are removed from the chunk (training) set
#   2. These trips are truncated and saved into the test set.
#   3. Iteratively, the size of chunk is reduced until we have passed
#      through all the cutoff dates.
#   4. This final remainder forms the training set.
for cutoff_date in cutoff_dates:

    # Allocate the inactive trips to the training set. Add 30 seconds for
    # the boundary cases (if time difference between cutoff_date and
    # TIMESTAMP is less than 30 seconds: not enough data to truncate)
    active = ((chunk.TIMESTAMP + pd.to_timedelta(30, unit='s')) <= cutoff_date) & \
             ((chunk.TIMESTAMP + pd.to_timedelta(chunk.DURATION, unit='s')) >= cutoff_date)

    # For the active trips, the trip is truncated at the cutoff time
    if np.sum(active) > 0:
        validation = chunk[active].reset_index(drop=True)

        # Compute elapsed time in seconds
        # astype(np.int64) returns unix time in nanoseconds!
        elapsed = np.abs((cutoff_date.astype(np.int64) -
                          (validation.TIMESTAMP.astype(np.int64) // 10 ** 9)))

        # Get the (integer) cutoff point from the elapsed time (15 seconds between each measurement)
        validation["CUTOFF"] = np.floor(elapsed / 15).astype(int)

        # Truncate the paths (UGLY WAY)
        validation["TRUNC_POLYLINE"] = None
        validation["TRUNC_GRID_POLYLINE"] = None
def main(loc_id, loc_name, output_version): print('Reading in short-term outcomes...') ## Read in short-term outcomes # region ------------------------------------------------------------------- # Durations and proportions dp = pd.read_csv( '{}WORK/12_bundle/covid/data/long_covid/long_covid_proportions_durations_with_overlaps.csv' .format(roots['j'])) # Mild/Moderate print(' mild/moderate...') midmod = Dataset(loc_id, loc_name, output_version, 'midmod', nf_type='long') # Hospital print(' hospital...') hospital = Dataset(loc_id, loc_name, output_version, 'hsp_admit', nf_type='long') # Icu print(' icu...') icu = Dataset(loc_id, loc_name, output_version, 'icu_admit', nf_type='long') # endregion ---------------------------------------------------------------- print('Calculating mild/moderate incidence & prevalence...') ## Mild/Moderate Incidence & Prevalence # region ------------------------------------------------------------------- # Shift hospitalizations 7 days lag_hsp = copy.deepcopy(hospital) lag_hsp.data = lag_hsp.data.drop(columns=['hospital_deaths']) lag_hsp.data.date = lag_hsp.data.date + pd.to_timedelta( roots['defaults']['symp_to_hsp_admit_duration'], unit='D') # Merge midmod and lag_hsp midmod.data = pd.merge( midmod.data, lag_hsp.data, how='left', on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date']) del lag_hsp # mild/moderate at risk number = (mild/moderate incidence - hospital admissions|7 days later) | # shift forward by {incubation period + mild/moderate duration|no hospital} midmod.data[ 'midmod_risk_num'] = midmod.data.midmod_inc - midmod.data.hospital_inc midmod.data.date = midmod.data.date + pd.to_timedelta( (roots['defaults']['incubation_period'] + roots['defaults']['midmod_duration_no_hsp']), unit='D') # Calculate the incidence of each symptom and overlap, regardless of co-occurrence of additional symptoms (not mutually exclusive) # mild/moderate long-term incidence = mild/moderate number at risk * proportion of mild/moderate with each long-term symptom cluster midmod.data['midmod_cog_inc'] = ( midmod.data.midmod_risk_num * dp.loc[(dp.outcome == 'cognitive') & (dp.population == 'midmod'), 'proportion_mean'].values[0]) midmod.data['midmod_fat_inc'] = ( midmod.data.midmod_risk_num * dp.loc[(dp.outcome == 'fatigue') & (dp.population == 'midmod'), 'proportion_mean'].values[0]) midmod.data['midmod_resp_inc'] = ( midmod.data.midmod_risk_num * dp.loc[(dp.outcome == 'respiratory') & (dp.population == 'midmod'), 'proportion_mean'].values[0]) midmod.data['midmod_cog_fat_inc'] = ( midmod.data.midmod_risk_num * dp.loc[(dp.outcome == 'cognitive_fatigue') & (dp.population == 'midmod'), 'proportion_mean'].values[0]) midmod.data['midmod_cog_resp_inc'] = ( midmod.data.midmod_risk_num * dp.loc[(dp.outcome == 'cognitive_respiratory') & (dp.population == 'midmod'), 'proportion_mean'].values[0]) midmod.data['midmod_fat_resp_inc'] = ( midmod.data.midmod_risk_num * dp.loc[(dp.outcome == 'fatigue_respiratory') & (dp.population == 'midmod'), 'proportion_mean'].values[0]) midmod.data['midmod_cog_fat_resp_inc'] = ( midmod.data.midmod_risk_num * dp.loc[(dp.outcome == 'cognitive_fatigue_respiratory') & (dp.population == 'midmod'), 'proportion_mean'].values[0]) # Creating mutually exclusive categories of symptoms # cog_inc = cog_inc - (cog_fat_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc midmod.data.midmod_cog_inc = (midmod.data.midmod_cog_inc - (midmod.data.midmod_cog_fat_inc - midmod.data.midmod_cog_fat_resp_inc) - (midmod.data.midmod_cog_resp_inc - 
midmod.data.midmod_cog_fat_resp_inc) - midmod.data.midmod_cog_fat_resp_inc) # fat_inc = fat_inc - (cog_fat_inc - cog_fat_resp_inc) - (fat_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc midmod.data.midmod_fat_inc = (midmod.data.midmod_fat_inc - (midmod.data.midmod_cog_fat_inc - midmod.data.midmod_cog_fat_resp_inc) - (midmod.data.midmod_fat_resp_inc - midmod.data.midmod_cog_fat_resp_inc) - midmod.data.midmod_cog_fat_resp_inc) # resp_inc = resp_inc - (fat_resp_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc midmod.data.midmod_resp_inc = (midmod.data.midmod_resp_inc - (midmod.data.midmod_fat_resp_inc - midmod.data.midmod_cog_fat_resp_inc) - (midmod.data.midmod_cog_resp_inc - midmod.data.midmod_cog_fat_resp_inc) - midmod.data.midmod_cog_fat_resp_inc) # cog_fat_inc = cog_fat_inc - cog_fat_resp_inc midmod.data.midmod_cog_fat_inc = (midmod.data.midmod_cog_fat_inc - midmod.data.midmod_cog_fat_resp_inc) # cog_resp_inc = cog_resp_inc - cog_fat_resp_inc midmod.data.midmod_cog_resp_inc = (midmod.data.midmod_cog_resp_inc - midmod.data.midmod_cog_fat_resp_inc) # fat_resp_inc = fat_resp_inc - cog_fat_resp_inc midmod.data.midmod_fat_resp_inc = (midmod.data.midmod_fat_resp_inc - midmod.data.midmod_cog_fat_resp_inc) # mild/moderate long-term prevalence = mild/moderate long-term incidence * [duration] midmod.data = calc_prev(df=midmod.data, dp=dp, dst_population='midmod', dst_outcome='cognitive', calc_col_stub='midmod_cog_') midmod.data = calc_prev(df=midmod.data, dp=dp, dst_population='midmod', dst_outcome='fatigue', calc_col_stub='midmod_fat_') midmod.data = calc_prev(df=midmod.data, dp=dp, dst_population='midmod', dst_outcome='respiratory', calc_col_stub='midmod_resp_') midmod.data = calc_prev(df=midmod.data, dp=dp, dst_population='midmod', dst_outcome='cognitive_fatigue', calc_col_stub='midmod_cog_fat_') midmod.data = calc_prev(df=midmod.data, dp=dp, dst_population='midmod', dst_outcome='cognitive_respiratory', calc_col_stub='midmod_cog_resp_') midmod.data = calc_prev(df=midmod.data, dp=dp, dst_population='midmod', dst_outcome='fatigue_respiratory', calc_col_stub='midmod_fat_resp_') midmod.data = calc_prev(df=midmod.data, dp=dp, dst_population='midmod', dst_outcome='cognitive_fatigue_respiratory', calc_col_stub='midmod_cog_fat_resp_') # Drop unneeded cols midmod.data = midmod.data.drop( columns=['midmod_inc', 'hospital_inc', 'midmod_risk_num']) # endregion ---------------------------------------------------------------- print('Calculating severe incidence and prevalence...') ## Severe Incidence & Prevalence # region ------------------------------------------------------------------- # Shift icu admissions lag_icu = copy.deepcopy(icu) lag_icu.data = lag_icu.data.drop(columns=['icu_deaths']) lag_icu.data.date = lag_icu.data.date + pd.to_timedelta( roots['defaults']['icu_to_death_duration'], unit='D') # Shift hospital deaths lag_hsp = copy.deepcopy(hospital) lag_hsp.data = lag_hsp.data.drop(columns=['hospital_inc']) lag_hsp.data.date = lag_hsp.data.date + pd.to_timedelta( roots['defaults']['hsp_no_icu_death_duration'], unit='D') # Merge lagged datasets lag = pd.merge( lag_icu.data, lag_hsp.data, how='left', on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date']) del lag_icu, lag_hsp hospital.data = pd.merge( hospital.data.drop(columns=['hospital_deaths']), lag, how='left', on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date']) del lag # severe at risk number = (hospital admissions - ICU admissions|3 days later - hospital deaths|6 days later) | # shift 
forward by {hospital duration if no ICU no death + hospital mild moderate duration after discharge} hospital.data['hospital_risk_num'] = (hospital.data.hospital_inc - hospital.data.icu_inc - hospital.data.hospital_deaths) hospital.data.date = hospital.data.date + pd.to_timedelta( (roots['defaults']['hsp_no_icu_no_death_duration'] + roots['defaults']['hsp_midmod_after_discharge_duration']), unit='D') # Calculate the incidence of each symptom and overlap, regardless of co-occurrence of additional symptoms (not mutually exclusive) # severe long-term incidence = severe at risk number * proportion of severe survivors with each long-term symptom cluster hospital.data['hospital_cog_inc'] = ( hospital.data.hospital_risk_num * dp.loc[(dp.outcome == 'cognitive') & (dp.population == 'hospital'), 'proportion_mean'].values[0]) hospital.data['hospital_fat_inc'] = ( hospital.data.hospital_risk_num * dp.loc[(dp.outcome == 'fatigue') & (dp.population == 'hospital'), 'proportion_mean'].values[0]) hospital.data['hospital_resp_inc'] = ( hospital.data.hospital_risk_num * dp.loc[(dp.outcome == 'respiratory') & (dp.population == 'hospital'), 'proportion_mean'].values[0]) hospital.data['hospital_cog_fat_inc'] = ( hospital.data.hospital_risk_num * dp.loc[(dp.outcome == 'cognitive_fatigue') & (dp.population == 'hospital'), 'proportion_mean'].values[0]) hospital.data['hospital_cog_resp_inc'] = ( hospital.data.hospital_risk_num * dp.loc[(dp.outcome == 'cognitive_respiratory') & (dp.population == 'hospital'), 'proportion_mean'].values[0]) hospital.data['hospital_fat_resp_inc'] = ( hospital.data.hospital_risk_num * dp.loc[(dp.outcome == 'fatigue_respiratory') & (dp.population == 'hospital'), 'proportion_mean'].values[0]) hospital.data['hospital_cog_fat_resp_inc'] = ( hospital.data.hospital_risk_num * dp.loc[(dp.outcome == 'cognitive_fatigue_respiratory') & (dp.population == 'hospital'), 'proportion_mean'].values[0]) # Creating mutually exclusive categories of symptoms # cog_inc = cog_inc - (cog_fat_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc hospital.data.hospital_cog_inc = ( hospital.data.hospital_cog_inc - (hospital.data.hospital_cog_fat_inc - hospital.data.hospital_cog_fat_resp_inc) - (hospital.data.hospital_cog_resp_inc - hospital.data.hospital_cog_fat_resp_inc) - hospital.data.hospital_cog_fat_resp_inc) # fat_inc = fat_inc - (cog_fat_inc - cog_fat_resp_inc) - (fat_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc hospital.data.hospital_fat_inc = ( hospital.data.hospital_fat_inc - (hospital.data.hospital_cog_fat_inc - hospital.data.hospital_cog_fat_resp_inc) - (hospital.data.hospital_fat_resp_inc - hospital.data.hospital_cog_fat_resp_inc) - hospital.data.hospital_cog_fat_resp_inc) # resp_inc = resp_inc - (fat_resp_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc hospital.data.hospital_resp_inc = ( hospital.data.hospital_resp_inc - (hospital.data.hospital_fat_resp_inc - hospital.data.hospital_cog_fat_resp_inc) - (hospital.data.hospital_cog_resp_inc - hospital.data.hospital_cog_fat_resp_inc) - hospital.data.hospital_cog_fat_resp_inc) # cog_fat_inc = cog_fat_inc - cog_fat_resp_inc hospital.data.hospital_cog_fat_inc = ( hospital.data.hospital_cog_fat_inc - hospital.data.hospital_cog_fat_resp_inc) # cog_resp_inc = cog_resp_inc - cog_fat_resp_inc hospital.data.hospital_cog_resp_inc = ( hospital.data.hospital_cog_resp_inc - hospital.data.hospital_cog_fat_resp_inc) # fat_resp_inc = fat_resp_inc - cog_fat_resp_inc hospital.data.hospital_fat_resp_inc = ( 
hospital.data.hospital_fat_resp_inc - hospital.data.hospital_cog_fat_resp_inc) # severe long-term prevalence = severe long-term incidence * [duration] hospital.data = calc_prev(df=hospital.data, dp=dp, dst_population='hospital', dst_outcome='cognitive', calc_col_stub='hospital_cog_') hospital.data = calc_prev(df=hospital.data, dp=dp, dst_population='hospital', dst_outcome='fatigue', calc_col_stub='hospital_fat_') hospital.data = calc_prev(df=hospital.data, dp=dp, dst_population='hospital', dst_outcome='respiratory', calc_col_stub='hospital_resp_') hospital.data = calc_prev(df=hospital.data, dp=dp, dst_population='hospital', dst_outcome='cognitive_fatigue', calc_col_stub='hospital_cog_fat_') hospital.data = calc_prev(df=hospital.data, dp=dp, dst_population='hospital', dst_outcome='cognitive_respiratory', calc_col_stub='hospital_cog_resp_') hospital.data = calc_prev(df=hospital.data, dp=dp, dst_population='hospital', dst_outcome='fatigue_respiratory', calc_col_stub='hospital_fat_resp_') hospital.data = calc_prev(df=hospital.data, dp=dp, dst_population='hospital', dst_outcome='cognitive_fatigue_respiratory', calc_col_stub='hospital_cog_fat_resp_') # Remove unneeded cols hospital.data = hospital.data.drop(columns=[ 'hospital_inc', 'icu_inc', 'hospital_deaths', 'hospital_risk_num' ]) # endregion ---------------------------------------------------------------- print('Calculating critical incidence and prevalence...') ## Critical Incidence & Prevalence # region ------------------------------------------------------------------- # Shift icu deaths lag_icu = copy.deepcopy(icu) lag_icu.data = lag_icu.data.drop(columns='icu_inc') lag_icu.data.date = lag_icu.data.date + pd.to_timedelta( roots['defaults']['icu_to_death_duration'], unit='D') # Merge icu and lag_icu icu.data = pd.merge( icu.data.drop(columns='icu_deaths'), lag_icu.data, how='left', on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date']) del lag_icu # critical at risk number = (ICU admissions - ICU deaths|3 days later) | # shift forward by {ICU duration if no death + ICU mild moderate duration after discharge} icu.data['icu_risk_num'] = icu.data.icu_inc - icu.data.icu_deaths icu.data.date = icu.data.date - pd.to_timedelta( (roots['defaults']['icu_no_death_duration'] + roots['defaults']['icu_midmod_after_discharge_duration']), unit='D') # Calculate the incidence of each symptom and overlap, regardless of co-occurrence of additional symptoms (not mutually exclusive) # critical long-term incidence = critical number at risk * proportion of critical with each long-term symptom cluster icu.data['icu_cog_inc'] = ( icu.data.icu_risk_num * dp.loc[(dp.outcome == 'cognitive') & (dp.population == 'icu'), 'proportion_mean'].values[0]) icu.data['icu_fat_inc'] = ( icu.data.icu_risk_num * dp.loc[(dp.outcome == 'fatigue') & (dp.population == 'icu'), 'proportion_mean'].values[0]) icu.data['icu_resp_inc'] = ( icu.data.icu_risk_num * dp.loc[(dp.outcome == 'respiratory') & (dp.population == 'icu'), 'proportion_mean'].values[0]) icu.data['icu_cog_fat_inc'] = ( icu.data.icu_risk_num * dp.loc[(dp.outcome == 'cognitive_fatigue') & (dp.population == 'icu'), 'proportion_mean'].values[0]) icu.data['icu_cog_resp_inc'] = ( icu.data.icu_risk_num * dp.loc[(dp.outcome == 'cognitive_respiratory') & (dp.population == 'icu'), 'proportion_mean'].values[0]) icu.data['icu_fat_resp_inc'] = ( icu.data.icu_risk_num * dp.loc[(dp.outcome == 'fatigue_respiratory') & (dp.population == 'icu'), 'proportion_mean'].values[0]) icu.data['icu_cog_fat_resp_inc'] = ( 
icu.data.icu_risk_num * dp.loc[(dp.outcome == 'cognitive_fatigue_respiratory') & (dp.population == 'icu'), 'proportion_mean'].values[0]) # Creating mutually exclusive categories of symptoms # cog_inc = cog_inc - (cog_fat_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc icu.data.icu_cog_inc = ( icu.data.icu_cog_inc - (icu.data.icu_cog_fat_inc - icu.data.icu_cog_fat_resp_inc) - (icu.data.icu_cog_resp_inc - icu.data.icu_cog_fat_resp_inc) - icu.data.icu_cog_fat_resp_inc) # fat_inc = fat_inc - (cog_fat_inc - cog_fat_resp_inc) - (fat_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc icu.data.icu_fat_inc = ( icu.data.icu_fat_inc - (icu.data.icu_cog_fat_inc - icu.data.icu_cog_fat_resp_inc) - (icu.data.icu_fat_resp_inc - icu.data.icu_cog_fat_resp_inc) - icu.data.icu_cog_fat_resp_inc) # resp_inc = resp_inc - (fat_resp_inc - cog_fat_resp_inc) - (cog_resp_inc - cog_fat_resp_inc) - cog_fat_resp_inc icu.data.icu_resp_inc = ( icu.data.icu_resp_inc - (icu.data.icu_fat_resp_inc - icu.data.icu_cog_fat_resp_inc) - (icu.data.icu_cog_resp_inc - icu.data.icu_cog_fat_resp_inc) - icu.data.icu_cog_fat_resp_inc) # cog_fat_inc = cog_fat_inc - cog_fat_resp_inc icu.data.icu_cog_fat_inc = (icu.data.icu_cog_fat_inc - icu.data.icu_cog_fat_resp_inc) # cog_resp_inc = cog_resp_inc - cog_fat_resp_inc icu.data.icu_cog_resp_inc = (icu.data.icu_cog_resp_inc - icu.data.icu_cog_fat_resp_inc) # fat_resp_inc = fat_resp_inc - cog_fat_resp_inc icu.data.icu_fat_resp_inc = (icu.data.icu_fat_resp_inc - icu.data.icu_cog_fat_resp_inc) # critical long-term prevalence = critical long-term incidence * [duration] icu.data = calc_prev(df=icu.data, dp=dp, dst_population='icu', dst_outcome='cognitive', calc_col_stub='icu_cog_') icu.data = calc_prev(df=icu.data, dp=dp, dst_population='icu', dst_outcome='fatigue', calc_col_stub='icu_fat_') icu.data = calc_prev(df=icu.data, dp=dp, dst_population='icu', dst_outcome='respiratory', calc_col_stub='icu_resp_') icu.data = calc_prev(df=icu.data, dp=dp, dst_population='icu', dst_outcome='cognitive_fatigue', calc_col_stub='icu_cog_fat_') icu.data = calc_prev(df=icu.data, dp=dp, dst_population='icu', dst_outcome='cognitive_respiratory', calc_col_stub='icu_cog_resp_') icu.data = calc_prev(df=icu.data, dp=dp, dst_population='icu', dst_outcome='fatigue_respiratory', calc_col_stub='icu_fat_resp_') icu.data = calc_prev(df=icu.data, dp=dp, dst_population='icu', dst_outcome='cognitive_fatigue_respiratory', calc_col_stub='icu_cog_fat_resp_') # Remove unneeded cols icu.data = icu.data.drop(columns=['icu_inc', 'icu_deaths', 'icu_risk_num']) del dp # endregion ---------------------------------------------------------------- print('Aggregating severities...') ## Aggregate Severities # region ------------------------------------------------------------------- df = copy.deepcopy(midmod) del midmod df.data = pd.merge( df.data, hospital.data, how='outer', on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date']) del hospital df.data = pd.merge( df.data, icu.data, how='outer', on=['location_id', 'age_group_id', 'sex_id', 'draw_var', 'date']) del icu # Incidence df.data['cognitive_inc'] = df.data[[ 'midmod_cog_inc', 'hospital_cog_inc', 'icu_cog_inc' ]].sum(axis=1) df.data.drop(columns=['midmod_cog_inc', 'hospital_cog_inc', 'icu_cog_inc'], inplace=True) df.data['fatigue_inc'] = df.data[[ 'midmod_fat_inc', 'hospital_fat_inc', 'icu_fat_inc' ]].sum(axis=1) df.data.drop(columns=['midmod_fat_inc', 'hospital_fat_inc', 'icu_fat_inc'], inplace=True) df.data['respiratory_inc'] = df.data[[ 
'midmod_resp_inc', 'hospital_resp_inc', 'icu_resp_inc' ]].sum(axis=1) df.data.drop( columns=['midmod_resp_inc', 'hospital_resp_inc', 'icu_resp_inc'], inplace=True) df.data['cognitive_fatigue_inc'] = df.data[[ 'midmod_cog_fat_inc', 'hospital_cog_fat_inc', 'icu_cog_fat_inc' ]].sum(axis=1) df.data.drop(columns=[ 'midmod_cog_fat_inc', 'hospital_cog_fat_inc', 'icu_cog_fat_inc' ], inplace=True) df.data['cognitive_respiratory_inc'] = df.data[[ 'midmod_cog_resp_inc', 'hospital_cog_resp_inc', 'icu_cog_resp_inc' ]].sum(axis=1) df.data.drop(columns=[ 'midmod_cog_resp_inc', 'hospital_cog_resp_inc', 'icu_cog_resp_inc' ], inplace=True) df.data['fatigue_respiratory_inc'] = df.data[[ 'midmod_fat_resp_inc', 'hospital_fat_resp_inc', 'icu_fat_resp_inc' ]].sum(axis=1) df.data.drop(columns=[ 'midmod_fat_resp_inc', 'hospital_fat_resp_inc', 'icu_fat_resp_inc' ], inplace=True) df.data['cognitive_fatigue_respiratory_inc'] = df.data[[ 'midmod_cog_fat_resp_inc', 'hospital_cog_fat_resp_inc', 'icu_cog_fat_resp_inc' ]].sum(axis=1) df.data.drop(columns=[ 'midmod_cog_fat_resp_inc', 'hospital_cog_fat_resp_inc', 'icu_cog_fat_resp_inc' ], inplace=True) # Prevalence df.data['cognitive_prev'] = df.data[[ 'midmod_cog_prev', 'hospital_cog_prev', 'icu_cog_prev' ]].sum(axis=1) df.data.drop( columns=['midmod_cog_prev', 'hospital_cog_prev', 'icu_cog_prev'], inplace=True) df.data['fatigue_prev'] = df.data[[ 'midmod_fat_prev', 'hospital_fat_prev', 'icu_fat_prev' ]].sum(axis=1) df.data.drop( columns=['midmod_fat_prev', 'hospital_fat_prev', 'icu_fat_prev'], inplace=True) df.data['respiratory_prev'] = df.data[[ 'midmod_resp_prev', 'hospital_resp_prev', 'icu_resp_prev' ]].sum(axis=1) df.data.drop( columns=['midmod_resp_prev', 'hospital_resp_prev', 'icu_resp_prev'], inplace=True) df.data['cognitive_fatigue_prev'] = df.data[[ 'midmod_cog_fat_prev', 'hospital_cog_fat_prev', 'icu_cog_fat_prev' ]].sum(axis=1) df.data.drop(columns=[ 'midmod_cog_fat_prev', 'hospital_cog_fat_prev', 'icu_cog_fat_prev' ], inplace=True) df.data['cognitive_respiratory_prev'] = df.data[[ 'midmod_cog_resp_prev', 'hospital_cog_resp_prev', 'icu_cog_resp_prev' ]].sum(axis=1) df.data.drop(columns=[ 'midmod_cog_resp_prev', 'hospital_cog_resp_prev', 'icu_cog_resp_prev' ], inplace=True) df.data['fatigue_respiratory_prev'] = df.data[[ 'midmod_fat_resp_prev', 'hospital_fat_resp_prev', 'icu_fat_resp_prev' ]].sum(axis=1) df.data.drop(columns=[ 'midmod_fat_resp_prev', 'hospital_fat_resp_prev', 'icu_fat_resp_prev' ], inplace=True) df.data['cognitive_fatigue_respiratory_prev'] = df.data[[ 'midmod_cog_fat_resp_prev', 'hospital_cog_fat_resp_prev', 'icu_cog_fat_resp_prev' ]].sum(axis=1) df.data.drop(columns=[ 'midmod_cog_fat_resp_prev', 'hospital_cog_fat_resp_prev', 'icu_cog_fat_resp_prev' ], inplace=True) # endregion ---------------------------------------------------------------- print('Aggregating by year...') ## Aggregate by year # region ------------------------------------------------------------------- # Subset to 2020 df.data = df.data[(df.data.date >= datetime.datetime(2020, 1, 1)) & (df.data.date <= datetime.datetime(2020, 12, 31))] # Sum by day df.collapse( agg_function='sum', group_cols=['location_id', 'age_group_id', 'sex_id', 'draw_var'], calc_cols=[ 'cognitive_inc', 'cognitive_prev', 'fatigue_inc', 'fatigue_prev', 'respiratory_inc', 'respiratory_prev', 'cognitive_fatigue_inc', 'cognitive_fatigue_prev', 'cognitive_respiratory_inc', 'cognitive_respiratory_prev', 'fatigue_respiratory_inc', 'fatigue_respiratory_prev', 'cognitive_fatigue_respiratory_inc', 
'cognitive_fatigue_respiratory_prev' ]) # Divide prevalence by 366 df.data.cognitive_prev = df.data.cognitive_prev / 366 df.data.fatigue_prev = df.data.fatigue_prev / 366 df.data.respiratory_prev = df.data.respiratory_prev / 366 df.data.cognitive_fatigue_prev = df.data.cognitive_fatigue_prev / 366 df.data.cognitive_respiratory_prev = df.data.cognitive_respiratory_prev / 366 df.data.fatigue_respiratory_prev = df.data.fatigue_respiratory_prev / 366 df.data.cognitive_fatigue_respiratory_prev = df.data.cognitive_fatigue_respiratory_prev / 366 # Ensure incidence and prevalence aren't negative df.check_neg(calc_cols=[ 'cognitive_inc', 'cognitive_prev', 'fatigue_inc', 'fatigue_prev', 'respiratory_inc', 'respiratory_prev', 'cognitive_fatigue_inc', 'cognitive_fatigue_prev', 'cognitive_respiratory_inc', 'cognitive_respiratory_prev', 'fatigue_respiratory_inc', 'fatigue_respiratory_prev', 'cognitive_fatigue_respiratory_inc', 'cognitive_fatigue_respiratory_prev' ]) # endregion ---------------------------------------------------------------- print('Calculating rates...') ## Calculate rates # region ------------------------------------------------------------------- # Pull population pop = get_population(age_group_id=roots['age_groups'], single_year_age=False, location_id=loc_id, location_set_id=35, year_id=roots['gbd_year'], sex_id=[1, 2], gbd_round_id=roots['gbd_round'], status='best', decomp_step=roots['decomp_step']) pop.drop(columns=['year_id', 'run_id'], inplace=True) # Merge population df.data = pd.merge(df.data, pop, how='left', on=['location_id', 'age_group_id', 'sex_id']) # Calculate rates df.data['cognitive_inc_rate'] = df.data.cognitive_inc / df.data.population df.data['fatigue_inc_rate'] = df.data.fatigue_inc / df.data.population df.data[ 'respiratory_inc_rate'] = df.data.respiratory_inc / df.data.population df.data[ 'cognitive_fatigue_inc_rate'] = df.data.cognitive_fatigue_inc / df.data.population df.data[ 'cognitive_respiratory_inc_rate'] = df.data.cognitive_respiratory_inc / df.data.population df.data[ 'fatigue_respiratory_inc_rate'] = df.data.fatigue_respiratory_inc / df.data.population df.data[ 'cognitive_fatigue_respiratory_inc_rate'] = df.data.cognitive_fatigue_respiratory_inc / df.data.population df.data[ 'cognitive_prev_rate'] = df.data.cognitive_prev / df.data.population df.data['fatigue_prev_rate'] = df.data.fatigue_prev / df.data.population df.data[ 'respiratory_prev_rate'] = df.data.respiratory_prev / df.data.population df.data[ 'cognitive_fatigue_prev_rate'] = df.data.cognitive_fatigue_prev / df.data.population df.data[ 'cognitive_respiratory_prev_rate'] = df.data.cognitive_respiratory_prev / df.data.population df.data[ 'fatigue_respiratory_prev_rate'] = df.data.fatigue_respiratory_prev / df.data.population df.data[ 'cognitive_fatigue_respiratory_prev_rate'] = df.data.cognitive_fatigue_respiratory_prev / df.data.population # endregion ---------------------------------------------------------------- print('Calculating YLDs...') ## Calculate YLDs # region ------------------------------------------------------------------- # Read in disability weights dw = pd.read_csv('{}dws.csv'.format(roots['disability_weight'])) # Temporary values df.data['cognitive_YLD'] = df.data.cognitive_prev_rate * 0.01 df.data['fatigue_YLD'] = df.data.fatigue_prev_rate * 0.01 df.data['respiratory_YLD'] = df.data.respiratory_prev_rate * 0.01 df.data[ 'cognitive_fatigue_YLD'] = df.data.cognitive_fatigue_prev_rate * 0.01 df.data[ 'cognitive_respiratory_YLD'] = df.data.cognitive_respiratory_prev_rate * 0.01 
df.data[ 'fatigue_respiratory_YLD'] = df.data.fatigue_respiratory_prev_rate * 0.01 df.data[ 'cognitive_fatigue_respiratory_YLD'] = df.data.cognitive_fatigue_respiratory_prev_rate * 0.01 del dw # endregion ---------------------------------------------------------------- print('Saving datasets and running diagnostics...') ## Save datasets & run diagnostics # region ------------------------------------------------------------------- # Cognitive df.save_data(output_cols=[ 'location_id', 'age_group_id', 'sex_id', 'draw_var', 'cognitive_inc', 'cognitive_prev', 'cognitive_inc_rate', 'cognitive_prev_rate', 'cognitive_YLD' ], filename='cognitive', stage='stage_2') # Fatigue df.save_data(output_cols=[ 'location_id', 'age_group_id', 'sex_id', 'draw_var', 'fatigue_inc', 'fatigue_prev', 'fatigue_inc_rate', 'fatigue_prev_rate', 'fatigue_YLD' ], filename='fatigue', stage='stage_2') # Respiratory df.save_data(output_cols=[ 'location_id', 'age_group_id', 'sex_id', 'draw_var', 'respiratory_inc', 'respiratory_prev', 'respiratory_inc_rate', 'respiratory_prev_rate', 'respiratory_YLD' ], filename='respiratory', stage='stage_2') # Cognitive Fatigue df.save_data(output_cols=[ 'location_id', 'age_group_id', 'sex_id', 'draw_var', 'cognitive_fatigue_inc', 'cognitive_fatigue_prev', 'cognitive_fatigue_inc_rate', 'cognitive_fatigue_prev_rate', 'cognitive_fatigue_YLD' ], filename='cognitive_fatigue', stage='stage_2') # Cognitive Respiratory df.save_data(output_cols=[ 'location_id', 'age_group_id', 'sex_id', 'draw_var', 'cognitive_respiratory_inc', 'cognitive_respiratory_prev', 'cognitive_respiratory_inc_rate', 'cognitive_respiratory_prev_rate', 'cognitive_respiratory_YLD' ], filename='cognitive_respiratory', stage='stage_2') # Fatigue Respiratory df.save_data(output_cols=[ 'location_id', 'age_group_id', 'sex_id', 'draw_var', 'fatigue_respiratory_inc', 'fatigue_respiratory_prev', 'fatigue_respiratory_inc_rate', 'fatigue_respiratory_prev_rate', 'fatigue_respiratory_YLD' ], filename='fatigue_respiratory', stage='stage_2') # Cognitive Fatigue Respiratory df.save_data(output_cols=[ 'location_id', 'age_group_id', 'sex_id', 'draw_var', 'cognitive_fatigue_respiratory_inc', 'cognitive_fatigue_respiratory_prev', 'cognitive_fatigue_respiratory_inc_rate', 'cognitive_fatigue_respiratory_prev_rate', 'cognitive_fatigue_respiratory_YLD' ], filename='cognitive_fatigue_respiratory', stage='stage_2')
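# The "mutually exclusive categories" blocks repeated above (for the midmod,
# hospital, and icu populations) are plain inclusion-exclusion over the three
# symptom clusters. A numeric sketch with made-up counts, not pipeline values:
cog, fat, resp = 100.0, 80.0, 60.0
cog_fat, cog_resp, fat_resp, cog_fat_resp = 30.0, 20.0, 15.0, 5.0

# pairwise-only counts: remove the triple overlap
cog_fat_only = cog_fat - cog_fat_resp        # 25
cog_resp_only = cog_resp - cog_fat_resp      # 15
fat_resp_only = fat_resp - cog_fat_resp      # 10

# single-cluster-only counts: remove both pairwise-only groups and the triple
cog_only = cog - cog_fat_only - cog_resp_only - cog_fat_resp     # 55
fat_only = fat - cog_fat_only - fat_resp_only - cog_fat_resp     # 40
resp_only = resp - cog_resp_only - fat_resp_only - cog_fat_resp  # 30

# everyone with cognitive symptoms now lands in exactly one bucket
assert cog_only + cog_fat_only + cog_resp_only + cog_fat_resp == cog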
def main(): """ Will generate a dictionary as follows: <key> patientid : <value> lsit of dicts, where each dict contains admission data [ {<key> feature/label name : <value> feature/label value} ] """ parser = argparse.ArgumentParser(description='Generate Text+Code dataset') parser.add_argument( '-p', '--path', default=None, type=str, help='path to pandas dataframe where rows are admissions') parser.add_argument( '-vp', '--vocab_path', default='', type=str, help= 'path to where code vocabulary are stored assumes diagnoses vocab file named as diag.vocab and cpt vocab as cpt.vocab' ) parser.add_argument('-s', '--save', default='./', type=str, help='path to save pkl files') parser.add_argument('-et', '--embed_text', default=False, action='store_true', help='flag wether to embed text or not') parser.add_argument('-cpb', '--bert_config_path', default=None, type=str, help='path to bert config') parser.add_argument('-vpb', '--bert_vocab_path', default=None, type=str, help='path to bert vocab ') parser.add_argument('-sdp', '--state_dict_path', default=None, type=str, help='path to bert state dict') parser.add_argument('-gpu', '--gpu', default=0, type=int) parser.add_argument('-bsl', '--max_bert_seq_len', default=512, type=int, help='maximum sequence length of bert model') parser.add_argument( '-tsld', '--text_seq_length_discharge', default=0, type=int, help= 'pass this if maximum text sequence length is known for discharge text to avoid long processing time' ) parser.add_argument( '-tslr', '--text_seq_length_rest', default=0, type=int, help= 'pass this if maximum text sequence length is known for rest of text (other than discharge) to avoid longer processing time' ) parser.add_argument('-sc', '--short_code', default=False, action='store_true', help='flag for using short codes ') parser.add_argument('-diag', '--diagnoses', default=False, action='store_true', help='flag for including diagnoses codes') parser.add_argument('-proc', '--procedures', default=False, action='store_true', help='flag for including procedures codes') parser.add_argument('-med', '--medications', default=False, action='store_true', help='flag for including medication codes') parser.add_argument('-cpt', '--cpts', default=False, action='store_true', help='flag for including cpt codes') parser.add_argument('-ma', '--min_adm', default=0, type=int) args = parser.parse_args() df = pd.read_pickle(args.path) df_orig = df # remove organ donor admissions if ('DIAGNOSIS' in df.columns): REMOVE_DIAGNOSIS = ~((df['DIAGNOSIS'] == 'ORGAN DONOR ACCOUNT') | (df['DIAGNOSIS'] == 'ORGAN DONOR') | \ (df['DIAGNOSIS'] == 'DONOR ACCOUNT')) df = df[REMOVE_DIAGNOSIS] df = df[~df['ICD9_CODE'].isna()] # drop patients with no icd9 code? 
df = df[~(df['TEXT_REST'].isna() | df['TEXT_REST'].isna())] if ('TIMEDELTA' in df.columns): df['TIMEDELTA'] = df['TIMEDELTA'].fillna(pd.to_timedelta("0")) df['TIMEDELTA'] = pd.to_timedelta(df['TIMEDELTA']) df['TIMEDELTA'] = df['TIMEDELTA'].apply(lambda x: x.seconds) pids = list(set(df['SUBJECT_ID'].tolist())) # lambda demographic_cols = { 'AGE': [], 'GENDER': [], 'LAST_CAREUNIT': [], 'MARITAL_STATUS': [], 'ETHNICITY': [], 'DISCHARGE_LOCATION': [] } df.loc[:, 'MARITAL_STATUS'], demographic_cols[ 'MARITAL_STATUS'] = pd.factorize(df['MARITAL_STATUS']) df.loc[:, 'ETHNICITY'], demographic_cols['ETHNICITY'] = pd.factorize( df['ETHNICITY']) df.loc[:, 'DISCHARGE_LOCATION'], demographic_cols[ 'DISCHARGE_LOCATION'] = pd.factorize(df['DISCHARGE_LOCATION']) df.loc[:, 'LAST_CAREUNIT'], demographic_cols['LAST_CAREUNIT'] = pd.factorize( df['LAST_CAREUNIT']) df.loc[:, 'GENDER'], demographic_cols['GENDER'] = pd.factorize(df['GENDER']) df.loc[:, 'AGE'] = df['AGE'].astype(int) los_bins = [1, 2, 3, 4, 5, 6, 7, 8, 14, float('inf')] los_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9] df.loc[:, 'LOS'] = pd.cut(df['LOS'], bins=los_bins, labels=los_labels) temp_data = [] data = {} diag_vocab = Vocab() cpt_vocab = Vocab() med_vocab = Vocab() proc_vocab = Vocab() if (args.vocab_path != ''): #to use below checkout https://github.com/sajaddarabi/HCUP-US-EHR if (args.diagnoses): diag_vocab._build_from_file( os.path.join(args.vocab_path, 'diag.vocab')) if (args.cpts): cpt_vocab._build_from_file( os.path.join(args.vocab_path, 'cpt.vocab')) #if (args.procedures): # proc_vocab._build_from_file(os.path.join(args.vocab_path, 'proc.vocab')) #if (args.med): #med_vocab._build_from_file(os.path.join(args.vocab_path, 'med.vocab')) if (os.path.exists(os.path.join(args.save, 'data.pkl'))): temp_data = pickle.load(open(os.path.join(args.save, 'data.pkl'), 'rb')) temp_data = temp_data['data'] t = list(temp_data.keys()) t = t[0] d = 'text_embedding' in temp_data[t][0] if (not d): temp_data = [] else: model = None bert_config = None torch.cuda.empty_cache() if args.embed_text: tokenizer = BertTokenizer(args.bert_vocab_path) if args.embed_text and (len(temp_data) == 0): bert_config = BertConfig(args.bert_config_path) model = BertTextModel(bert_config) state_dict = torch.load(args.state_dict_path) model.init_bert_weights(state_dict) device, _ = _prepare_device(args.gpu) model = model.to(device) max_seq_len_text_d = args.text_seq_length_discharge max_seq_len_text_r = args.text_seq_length_rest if max_seq_len_text_d == 0: max_seq_len_text = compute_max_seq_len_text( df, 'TEXT_DISCHARGE', tokenizer) max_seq_len_text = max_seq_len_text // args.max_bert_seq_len + 1 max_seq_len_text_d = max_seq_len_text print("text sequence discharge length: {}".format( max_seq_len_text_d)) if max_seq_len_text_r == 0: max_seq_len_text = compute_max_seq_len_text( df, 'TEXT_REST', tokenizer) max_seq_len_text = max_seq_len_text // args.max_bert_seq_len + 1 max_seq_len_text_r = max_seq_len_text print("text sequence rest length: {}".format(max_seq_len_text_r)) try: for pid in tqdm(pids): pid_df = df[df['SUBJECT_ID'] == pid] pid_df = pid_df.sort_values('ADMITTIME').reset_index() if (len(pid_df) < 1): # must atleast have two data points continue data[pid] = [] t = 0 hadm_ids = set(df['HADM_ID']) for i, r in pid_df.iterrows(): #filt notes prior to n days and concatenate them # leave discharge summary seperate admit_data = {} demographics = [r['AGE'], r['GENDER'], r['MARITAL_STATUS']] icu_unit = np.zeros((demographic_cols['LAST_CAREUNIT'].size, ), dtype=int) 
icu_unit[r['LAST_CAREUNIT']] = 1 demographics += list(icu_unit) ethnicity = np.zeros((demographic_cols['ETHNICITY'].size, ), dtype=int) ethnicity[r['ETHNICITY']] = 1 demographics += list(ethnicity) ethnicity = np.zeros((demographic_cols['ETHNICITY'].size, ), dtype=int) ethnicity[r['ETHNICITY']] = 1 demographics += list(ethnicity) admit_data['demographics'] = demographics dtok, ptok, mtok, ctok = [], [], [], [] diagnosis_codes, proc_codes, med_codes, cpt_codes = np.nan, np.nan, np.nan, np.nan if args.diagnoses: diagnosis_codes = r['ICD9_CODE'] if (diagnosis_codes == diagnosis_codes): dtok = diag_vocab.convert_to_ids(diagnosis_codes, 'D', args.short_code) if (args.procedures): proc_codes = r['ICD9_CODE_PROCEDURE'] if (proc_codes == proc_codes): ptok = proc_vocab.convert_to_ids(proc_codes, 'P', args.short_code) if args.medications: med_codes = r[ 'NDC'] # issue with NDC what mapping version is being used..? if (med_codes == med_codes): mtok = med_vocab.convert_to_ids(med_codes, 'M') if args.cpts: cpt_codes = r['CPT_CD'] if (cpt_codes == cpt_codes): ctok = cpt_vocab.convert_to_ids(cpt_codes, 'C') admit_data['diagnoses'] = dtok admit_data['procedures'] = ptok admit_data['medications'] = mtok admit_data['cptproc'] = ctok if (r['TIMEDELTA'] == r['TIMEDELTA']): t += r['TIMEDELTA'] admit_data['timedelta'] = t text_discharge = r['TEXT_DISCHARGE'] text_rest = r['TEXT_REST'] ttokd = tokenizer.tokenize(text_discharge) ttokd = tokenizer.convert_tokens_to_ids(ttokd) ttokr = tokenizer.tokenize(text_rest) ttokr = tokenizer.convert_tokens_to_ids(ttokr) admit_data['text_discharge_raw'] = text_discharge admit_data['text_rest_raw'] = text_rest admit_data['text_discharge_len'] = len(ttokd) admit_data['text_rest_len'] = len(ttokr) admit_data['text_discharge_token'] = ttokd admit_data['text_rest_token'] = ttokr if len(temp_data) == 0: if (args.embed_text): ttokd = embed_text(ttokd, device, model, args.max_bert_seq_len, max_seq_len_text_d) ttokd = ttokd.cpu().numpy() ttokr = embed_text(ttokr, device, model, args.max_bert_seq_len, max_seq_len_text_r) ttokr = ttokr.cpu().numpy() else: ttok = temp_data[pid][i]['text_embedding'] admit_data['text_embedding_discharge'] = ttokd admit_data['text_embedding_rest'] = ttokr admit_data['los'] = r['LOS'] admit_data['readmission'] = r['readmission_label'] admit_data['mortality'] = r['DEATHTIME'] == r['DEATHTIME'] data[pid].append(admit_data) except Exception as error: print(error) import pdb pdb.set_trace() if (not os.path.exists(args.save)): os.makedirs(args.save) # temporarly save data incase something goes wrong ... 
try: with open(os.path.join(args.save, 'data.pkl'), 'wb') as handle: data_dict = {} data_dict['data'] = data pickle.dump(data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) except: import pdb pdb.set_trace() pids = list(data.keys()) flatten = lambda x: [item for sublist in x for item in sublist] data_info = {} num_icd9_codes, num_proc_codes, num_med_codes = 0, 0, 0 data_info['num_patients'] = len(pids) data_info['max_seq_len_text_d'] = max_seq_len_text_d data_info['max_seq_len_text_r'] = max_seq_len_text_r data_info['num_icd9_codes'] = 0 data_info['num_proc_codes'] = 0 data_info['num_med_codes'] = 0 if (args.diagnoses): num_icd9_codes = len(set(flatten(df_orig['ICD9_CODE'].dropna()))) data_info['num_icd9_codes'] = num_icd9_codes if (args.procedures): num_proc_codes = len( set(flatten(df_orig['ICD9_CODE_PROCEDURE'].dropna()))) data_info['num_proc_codes'] = num_proc_codes if (args.medications): num_med_codes = len(set(flatten(df_orig['NDC'].dropna()))) data_info['num_med_codes'] = num_med_codes data_info['demographics_shape'] = len(data[pids[0]][0]['demographics']) data_info['demographic_cols'] = demographic_cols data_info['total_codes'] = data_info['num_icd9_codes'] + data_info[ 'num_proc_codes'] + data_info['num_med_codes'] if (not os.path.exists(args.save)): os.makedirs(args.save) with open(os.path.join(args.save, 'data.pkl'), 'wb') as handle: data_dict = {} data_dict['info'] = data_info data_dict['data'] = data pickle.dump(data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(args.save, 'cpt_vocab.pkl'), 'wb') as handle: pickle.dump(cpt_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(args.save, 'diag_vocab.pkl'), 'wb') as handle: pickle.dump(diag_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(args.save, 'med_vocab.pkl'), 'wb') as handle: pickle.dump(med_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(args.save, 'proc_vocab.pkl'), 'wb') as handle: pickle.dump(proc_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
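One detail of the TIMEDELTA handling in the function above is easy to miss: `Timedelta.seconds` returns only the seconds component of a timedelta, not the full duration. A minimal sketch of the difference, using made-up values rather than the MIMIC data the script reads:

import pandas as pd

s = pd.Series(pd.to_timedelta(["0 days 00:30:00", "2 days 00:30:00"]))
print(s.dt.seconds.tolist())          # [1800, 1800]     -- whole days are dropped
print(s.dt.total_seconds().tolist())  # [1800.0, 174600.0]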
def main(locator, weather_path, scenario, parameter_set, time_start, time_end, time_step_ts, set_temperature_goal, constant_temperature): # Preliminary step - time date_and_time_prediction = pd.date_range( start=time_start, end=time_end, freq=pd.to_timedelta(time_step_ts)) time_step = date_and_time_prediction[1] - date_and_time_prediction[0] time_end_object = datetime.datetime.strptime(time_end, '%Y-%m-%d %H:%M:%S') last_step_plus_1 = time_end_object + time_step last_step_plus_1_str = datetime.datetime.strftime(last_step_plus_1, '%Y-%m-%d %H:%M:%S') date_and_time_prediction_plus_1 = pd.date_range( start=time_start, end=last_step_plus_1_str, freq=pd.to_timedelta(time_step_ts)) # Getting and writting general data (internal_loads_df, indoor_comfort_df, construction_envelope_systems_df, leakage_envelope_systems_df, window_envelope_systems_df, roofs_envelope_systems_df, wall_envelope_systems_df, shading_envelope_systems_df, emission_systems_heating_df, emission_systems_cooling_df, emission_systems_controller_df, system_controls_ini_df, cooling_generation_df, zone_occupancy_df, zone_df, architecture_df, technical_systems_df, supply_systems_df, weather_general_info, weather_timeseries_initial_df, occupancy_types_full, occupancy_types, buildings_names, building_geometry_all, occupancy_types_full_cardinal, buildings_cardinal, occupancy_types_cardinal, occupants_probability_dic, lighting_appliances_probability_dic, processes_probability_dic, monthly_use_probability_df, occupancy_density_m2_p, footprint, gross_floor_area_m2, floors_cardinal_df, total_gross_floor_area_m2, mean_floor_height_m, system_controls_df, supply_temperature_df, emissions_cooling_type_dic, emissions_controller_type_dic, generation_cooling_code_dic, occupancy_per_building_cardinal, occupancy_per_building_list, T_int_cea_dic, T_ext_cea_df) = building_extract_cea_data.main(locator, weather_path, time_start, time_end) (date_and_time, year, wet_bulb_temperature_df, occupancy_probability_df) = building_write_definitions.main( locator, scenario, date_and_time_prediction, time_start, time_end, time_step, parameter_set, internal_loads_df, construction_envelope_systems_df, leakage_envelope_systems_df, window_envelope_systems_df, roofs_envelope_systems_df, wall_envelope_systems_df, shading_envelope_systems_df, zone_occupancy_df, architecture_df, weather_general_info, weather_timeseries_initial_df, occupancy_types, occupancy_types_cardinal, buildings_names, building_geometry_all, occupants_probability_dic, lighting_appliances_probability_dic, processes_probability_dic, monthly_use_probability_df, occupancy_density_m2_p, gross_floor_area_m2, mean_floor_height_m, DELTA_P_DIM, HE_E, H_I, DENSITY_AIR, HEAT_CAPACITY_AIR, supply_temperature_df, emissions_cooling_type_dic) (prediction_horizon, center_interval_temperatures_dic, set_setback_temperatures_dic, setback_boolean_dic, heating_boolean, cooling_boolean, set_temperatures_dic) = building_setup_district.main( date_and_time_prediction, time_step, set_temperature_goal, constant_temperature, buildings_names, system_controls_df, occupancy_per_building_cardinal, occupancy_per_building_list, occupancy_probability_df, indoor_comfort_df, T_int_cea_dic) electricity_prices_MWh = pd.read_excel(locator.get_electricity_costs(), "ELECTRICITY") electricity_prices_MWh[ "PRICE ($/MWh)"] = electricity_prices_MWh["cost_kWh"] * 1000 electricity_prices_MWh["our_datetime"] = pd.date_range( start='1/1/2005', periods=HOURS_IN_YEAR) electricity_prices_MWh.set_index('our_datetime', inplace=True) ( Qcsmax_Wm2_dic, 
em_efficiency_mean_dic, ) = building_process_hvac_efficiencies.main( locator, buildings_names, footprint, buildings_cardinal, cooling_generation_df, emission_systems_cooling_df, emission_systems_controller_df, generation_cooling_code_dic, emissions_cooling_type_dic, emissions_controller_type_dic, set_temperatures_dic, T_ext_cea_df, wet_bulb_temperature_df, prediction_horizon, date_and_time_prediction, occupancy_per_building_cardinal, occupancy_per_building_list, supply_temperature_df, PHI_5_MAX, FB, HP_ETA_EX_COOL, HP_AUXRATIO) return (prediction_horizon, date_and_time_prediction, date_and_time_prediction_plus_1, time_step, year, buildings_names, buildings_cardinal, center_interval_temperatures_dic, set_setback_temperatures_dic, setback_boolean_dic, heating_boolean, cooling_boolean, set_temperatures_dic, occupancy_per_building_cardinal, occupancy_per_building_list, gross_floor_area_m2, total_gross_floor_area_m2, indoor_comfort_df, occupancy_density_m2_p, occupancy_probability_df, em_efficiency_mean_dic, Qcsmax_Wm2_dic, electricity_prices_MWh)
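The prediction index at the top of this function is built by passing a `pd.Timedelta` as the `freq` of `pd.date_range`. A minimal sketch with hypothetical start/end/step values (the real ones are passed in as function arguments):

import pandas as pd

time_start = '2005-06-01 00:00:00'
time_end = '2005-06-01 03:00:00'
time_step_ts = '1 h'
date_and_time_prediction = pd.date_range(start=time_start, end=time_end,
                                         freq=pd.to_timedelta(time_step_ts))
print(date_and_time_prediction)  # four hourly timestamps, 00:00 through 03:00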
def interpolate_weather_file(weather_file_path, weather_data_type, datetime_start, datetime_end, interpolation_freq, remove_leapyear): """Interpolate the data from a weather file to a new frequency.""" debug_plotting = False # Show a plot to check the interpolation result # debug_plotting = True # Show a plot to check the interpolation result # plot_value = 'IBEAM_H' # plot_value = 'IDIFF_H' plot_value = 'TAMB' # plot_value = 'WSPEED' # plot_value = 'RHUM' # plot_value = 'WDIR' # plot_value = 'CCOVER' # plot_value = 'PAMB' weather_file = os.path.basename(weather_file_path) # Read the file and store it in a DataFrame if weather_data_type == 'IGS' or weather_data_type == 'TRNSYS': weather_data = read_IGS_weather_file(weather_file_path) elif weather_data_type == 'DWD': weather_data = read_DWD_weather_file(weather_file_path) else: logger.error('Weather data type "'+weather_data_type+'" unknown!') exit() # Assumption: The IGS weather files always start at January 01. current_year = datetime_start.year newyear = datetime.datetime(current_year, 1, 1) # Convert hours of year to DateTime and make that the index of DataFrame weather_data.index = pd.to_timedelta(weather_data['HOUR'], unit='h') + newyear # Infer the time frequency of the original data original_freq = pd.infer_freq(weather_data.index, warn=True) original_freq = pd.to_timedelta(1, unit=original_freq) # logger.debug('Inferred freqency = '+str(original_freq)) if debug_plotting is True: # Plot the original data (Ambient temperature) fig = plt.figure() fig.suptitle(weather_file) weather_data[plot_value].plot(marker='.', label=plot_value+' orig') if interpolation_freq != original_freq: # Perform interpolation to new index of hours # Definition: # "Column value is a mean value related to the time interval delta t # ending at the time corresponding to actual weather_data line." # Thus during interpolation, a value must move to the middle of the # previous timestep # If the new frequency is larger (i.e. we are downsampling the data), # we need to use 'resample' to take the mean of the time intervals we # combine if interpolation_freq > original_freq: weather_data = weather_data.resample(interpolation_freq, label='right', closed='right').mean() # Now we can do the interpolation (upsampling). If we downsampled # before, this now only affects the start and end of the data # Create a shifted index to interpolate to interpolate_index = pd.date_range( start=datetime_start + pd.Timedelta(original_freq)/2 # shift + pd.Timedelta(interpolation_freq), # prevent "0 h" time stamp end=datetime_end + pd.Timedelta(original_freq)/2, # shift freq=interpolation_freq) weather_data = weather_data.reindex(interpolate_index) if interpolation_freq < original_freq: # Shift the correct number of steps to set a value to the middle # of the time step weather_data = weather_data.shift( freq=-pd.Timedelta(original_freq)/2) weather_data = weather_data.interpolate(method='time') # The interpolation will generate NaN on the lines before the first # original line (hours = 1). 
        # Fill those NaN 'backwards' with the last valid values:
        weather_data.fillna(method='backfill', inplace=True)

        # Cloud cover is given in integers, so interpolated values need to be
        # rounded
        weather_data['CCOVER'] = weather_data['CCOVER'].round(decimals=0)

        # Convert DateTime index to hours of the year
        weather_data['HOUR'] = (weather_data.index - datetime_start) / \
            np.timedelta64(1, 'h')

        if debug_plotting is True:
            # Plot the interpolated data
            weather_data[plot_value].plot(marker='x',
                                          label=plot_value+' intpl.')
    else:
        # No interpolation required. But we need to slice from start to end
        weather_data = weather_data[datetime_start:datetime_end]

    # Remove leapyear from DataFrame (optional)
    if calendar.isleap(current_year) is True:
        logger.warn(str(current_year)+' is a leap year. Be careful!')
    if remove_leapyear is True:
        weather_data = weather_data[~((weather_data.index.month == 2) &
                                      (weather_data.index.day == 29))]

    # Now show the plots, including their legend
    if debug_plotting is True:
        plt.legend()
        plt.show(block=False)

    return weather_data
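The comments in this function explain why the data are shifted by half the original time step before interpolating: each value is the mean over the interval ending at its time stamp. A minimal sketch of that shift-then-interpolate idea on hypothetical hourly data (not the function's actual weather input):

import pandas as pd

idx = pd.date_range('2020-01-01 01:00', periods=4, freq='60min')  # stamps mark interval ends
hourly = pd.Series([1.0, 2.0, 3.0, 4.0], index=idx)

# move each value to the middle of its one-hour averaging interval
shifted = hourly.shift(freq=-pd.Timedelta('30min'))

# upsample to 15 minutes by time-based interpolation
target = pd.date_range(shifted.index[0], shifted.index[-1], freq='15min')
result = (shifted.reindex(shifted.index.union(target))
                 .interpolate(method='time')
                 .reindex(target))
print(result)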
train1_mergeDataset_add = pd.merge(train1_mergeDataset, train1_dayofweek_dataset, on=['user_id', 'day_of_week'], how='left') #train1_mergeDataset_add = train1_mergeDataset #train1_mergeDataset_add.drop(['DOW_power_sum', 'DOW_allsum', 'power_sum', 'DOW_power_mean', 'DOW_power_std', 'DOW_powaer_rate', 'power_mean', 'power_std', 'power_rate'], axis=1,inplace=True) #train1_mergeDataset_add.drop(['DOW_power_sum', 'DOW_allsum', 'power_sum', 'DOW_power_mean', 'DOW_powaer_rate', 'power_mean', 'power_rate'], axis=1,inplace=True) #train1_mergeDataset_add.drop(['DOW_power_sum', 'DOW_allsum', 'power_sum'], axis=1,inplace=True) train1_mergeDataset_add.drop([ 'DOW_power_sum', 'DOW_allsum', 'power_sum', 'DOW_power_rate', 'power_rate' ], axis=1, inplace=True) train1_Y = handledataset[ (handledataset.record_date >= (pd.to_datetime('2015-01-01') + pd.to_timedelta(7 * 81, unit='D'))) & (handledataset.record_date < (pd.to_datetime('2015-01-01') + pd.to_timedelta(7 * 82, unit='D')))] final_train1 = pd.merge(train1_mergeDataset_add, train1_Y, on=['user_id', 'day_of_week'], how='left') print "select train2 dataset ............." train2 = pd.read_csv( u'/home/haven/Tianchi_power/Wavelet_Handle(2)/F3_Result/train2AndPredictY.csv' ) train2_MeanStdSum = train2.groupby(['user_id'])['power_consumption'].agg({ 'power_mean': np.mean, 'power_std':
import numpy as np
import pandas as pd
import matplotlib.dates as mdates
from matplotlib.backends.backend_pdf import PdfPages

pdf = PdfPages("sg_electricity.pdf")

df = pd.read_csv("sg_electricity.csv")
df["date"] = pd.to_datetime(df.date)


# Clean up the hour variable and create a unified date variable
def f(x):
    u = x.split(":")
    return int(u[0]) + int(u[1]) / 60 - 0.5


df["hourofday"] = df.period_ending_time.apply(f)
df["date"] += pd.to_timedelta(df.hourofday, 'h')

# Below are functions to extract different elements of the date
# variable, in most cases using sin/cos to force continuous
# periodicity.


# Trend (non-periodic)
def q0(x):
    return (x - pd.to_datetime("2012-01-01")).dt.days


# Periodic cycle by year
def f1(x):
    return np.cos(2 * np.pi * x.dt.dayofyear / 365)
def test_string_indexing(self):
    # GH 16896
    df = pd.DataFrame({"x": range(3)},
                      index=pd.to_timedelta(range(3), unit="days"))
    expected = df.iloc[0]
    sliced = df.loc["0 days"]
    tm.assert_series_equal(sliced, expected)
def _assemble_from_unit_mappings(arg, errors): """ assemble the unit specifed fields from the arg (DataFrame) Return a Series for actual parsing Parameters ---------- arg : DataFrame errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input Returns ------- Series """ from pandas import to_timedelta, to_numeric, DataFrame arg = DataFrame(arg) if not arg.columns.is_unique: raise ValueError("cannot assemble with duplicate keys") # replace passed unit with _unit_map def f(value): if value in _unit_map: return _unit_map[value] # m is case significant if value.lower() in _unit_map: return _unit_map[value.lower()] return value unit = {k: f(k) for k in arg.keys()} unit_rev = {v: k for k, v in unit.items()} # we require at least Ymd required = ['year', 'month', 'day'] req = sorted(list(set(required) - set(unit_rev.keys()))) if len(req): raise ValueError("to assemble mappings requires at " "least that [year, month, day] be specified: " "[{0}] is missing".format(','.join(req))) # keys we don't recognize excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values()))) if len(excess): raise ValueError("extra keys have been passed " "to the datetime assemblage: " "[{0}]".format(','.join(excess))) def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) # prevent overflow in case of int8 or int16 if is_integer_dtype(values): values = values.astype('int64', copy=False) return values values = (coerce(arg[unit_rev['year']]) * 10000 + coerce(arg[unit_rev['month']]) * 100 + coerce(arg[unit_rev['day']])) try: values = to_datetime(values, format='%Y%m%d', errors=errors) except (TypeError, ValueError) as e: raise ValueError("cannot assemble the " "datetimes: {0}".format(e)) for u in ['h', 'm', 's', 'ms', 'us', 'ns']: value = unit_rev.get(u) if value is not None and value in arg: try: values += to_timedelta(coerce(arg[value]), unit=u, errors=errors) except (TypeError, ValueError) as e: raise ValueError("cannot assemble the datetimes " "[{0}]: {1}".format(value, e)) return values
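For context, the public interface this helper backs is `pd.to_datetime` called on a DataFrame whose columns name datetime units; a minimal sketch:

import pandas as pd

df = pd.DataFrame({'year': [2015, 2016],
                   'month': [2, 3],
                   'day': [4, 5],
                   'hour': [6, 7]})
print(pd.to_datetime(df))
# 0   2015-02-04 06:00:00
# 1   2016-03-05 07:00:00
# dtype: datetime64[ns]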
def test_construction(self): expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8') assert Timedelta(10, unit='d').value == expected assert Timedelta(10.0, unit='d').value == expected assert Timedelta('10 days').value == expected assert Timedelta(days=10).value == expected assert Timedelta(days=10.0).value == expected expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8') assert Timedelta('10 days 00:00:10').value == expected assert Timedelta(days=10, seconds=10).value == expected assert Timedelta(days=10, milliseconds=10 * 1000).value == expected assert (Timedelta(days=10, microseconds=10 * 1000 * 1000).value == expected) # gh-8757: test construction with np dtypes timedelta_kwargs = { 'days': 'D', 'seconds': 's', 'microseconds': 'us', 'milliseconds': 'ms', 'minutes': 'm', 'hours': 'h', 'weeks': 'W' } npdtypes = [ np.int64, np.int32, np.int16, np.float64, np.float32, np.float16 ] for npdtype in npdtypes: for pykwarg, npkwarg in timedelta_kwargs.items(): expected = np.timedelta64(1, npkwarg).astype('m8[ns]').view('i8') assert Timedelta(**{pykwarg: npdtype(1)}).value == expected # rounding cases assert Timedelta(82739999850000).value == 82739999850000 assert ('0 days 22:58:59.999850' in str(Timedelta(82739999850000))) assert Timedelta(123072001000000).value == 123072001000000 assert ('1 days 10:11:12.001' in str(Timedelta(123072001000000))) # string conversion with/without leading zero # GH 9570 assert Timedelta('0:00:00') == timedelta(hours=0) assert Timedelta('00:00:00') == timedelta(hours=0) assert Timedelta('-1:00:00') == -timedelta(hours=1) assert Timedelta('-01:00:00') == -timedelta(hours=1) # more strings & abbrevs # GH 8190 assert Timedelta('1 h') == timedelta(hours=1) assert Timedelta('1 hour') == timedelta(hours=1) assert Timedelta('1 hr') == timedelta(hours=1) assert Timedelta('1 hours') == timedelta(hours=1) assert Timedelta('-1 hours') == -timedelta(hours=1) assert Timedelta('1 m') == timedelta(minutes=1) assert Timedelta('1.5 m') == timedelta(seconds=90) assert Timedelta('1 minute') == timedelta(minutes=1) assert Timedelta('1 minutes') == timedelta(minutes=1) assert Timedelta('1 s') == timedelta(seconds=1) assert Timedelta('1 second') == timedelta(seconds=1) assert Timedelta('1 seconds') == timedelta(seconds=1) assert Timedelta('1 ms') == timedelta(milliseconds=1) assert Timedelta('1 milli') == timedelta(milliseconds=1) assert Timedelta('1 millisecond') == timedelta(milliseconds=1) assert Timedelta('1 us') == timedelta(microseconds=1) assert Timedelta('1 micros') == timedelta(microseconds=1) assert Timedelta('1 microsecond') == timedelta(microseconds=1) assert Timedelta('1.5 microsecond') == Timedelta('00:00:00.000001500') assert Timedelta('1 ns') == Timedelta('00:00:00.000000001') assert Timedelta('1 nano') == Timedelta('00:00:00.000000001') assert Timedelta('1 nanosecond') == Timedelta('00:00:00.000000001') # combos assert Timedelta('10 days 1 hour') == timedelta(days=10, hours=1) assert Timedelta('10 days 1 h') == timedelta(days=10, hours=1) assert Timedelta('10 days 1 h 1m 1s') == timedelta(days=10, hours=1, minutes=1, seconds=1) assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( days=10, hours=1, minutes=1, seconds=1) assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( days=10, hours=1, minutes=1, seconds=1) assert Timedelta('-10 days 1 h 1m 1s 3us') == -timedelta( days=10, hours=1, minutes=1, seconds=1, microseconds=3) assert Timedelta('-10 days 1 h 1.5m 1s 3us'), -timedelta( days=10, hours=1, minutes=1, seconds=31, microseconds=3) # Currently 
invalid as it has a - on the hh:mm:dd part # (only allowed on the days) pytest.raises(ValueError, lambda: Timedelta('-10 days -1 h 1.5m 1s 3us')) # only leading neg signs are allowed pytest.raises(ValueError, lambda: Timedelta('10 days -1 h 1.5m 1s 3us')) # no units specified pytest.raises(ValueError, lambda: Timedelta('3.1415')) # invalid construction tm.assert_raises_regex(ValueError, "cannot construct a Timedelta", lambda: Timedelta()) tm.assert_raises_regex(ValueError, "unit abbreviation w/o a number", lambda: Timedelta('foo')) tm.assert_raises_regex( ValueError, "cannot construct a Timedelta from the " "passed arguments, allowed keywords are ", lambda: Timedelta(day=10)) # round-trip both for string and value for v in [ '1s', '-1s', '1us', '-1us', '1 day', '-1 day', '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns', '1ns', '-23:59:59.999999999' ]: td = Timedelta(v) assert Timedelta(td.value) == td # str does not normally display nanos if not td.nanoseconds: assert Timedelta(str(td)) == td assert Timedelta(td._repr_base(format='all')) == td # floats expected = np.timedelta64( 10, 's').astype('m8[ns]').view('i8') + np.timedelta64( 500, 'ms').astype('m8[ns]').view('i8') assert Timedelta(10.5, unit='s').value == expected # offset assert (to_timedelta( pd.offsets.Hour(2)) == Timedelta('0 days, 02:00:00')) assert (Timedelta(pd.offsets.Hour(2)) == Timedelta('0 days, 02:00:00')) assert (Timedelta( pd.offsets.Second(2)) == Timedelta('0 days, 00:00:02')) # gh-11995: unicode expected = Timedelta('1H') result = pd.Timedelta(u'1H') assert result == expected assert (to_timedelta( pd.offsets.Hour(2)) == Timedelta(u'0 days, 02:00:00')) pytest.raises(ValueError, lambda: Timedelta(u'foo bar'))
def setup_interface_daily(): b_d = "temp_daily" nam_file = "freyberg.nam" m = flopy.modflow.Modflow.load(nam_file, model_ws=b_d, check=False, forgive=False) # assign the executable name for the model m.exe_name = "mfnwt" # now let's run this in a new folder called temp so we don't overwrite the original data m.change_model_ws("temp", reset_external=True) # this writes all the MODFLOW files in the new location m.write_input() # the following helps get the dependecies (both python and executables) in the right place prep_deps.prep_template(t_d="temp") pyemu.os_utils.run("{0} {1}".format(m.exe_name, m.name + ".nam"), cwd=m.model_ws) props = [] paks = [ "upw.hk", "upw.vka", "upw.ss", "upw.sy", "bas6.strt", "extra.prsity" ] #"extra" because not a modflow parameter for k in range(m.nlay): props.extend([[p, k] for p in paks]) const_props = props.copy() props.append(["rch.rech", None]) for kper in range(m.nper): const_props.append(["rch.rech", kper]) spatial_list_props = [ ["wel.flux", 2], ["ghb.cond", 0], ["ghb.cond", 1], ["ghb.cond", 2] ] # spatially by each list entry, across all stress periods temporal_list_props = [["wel.flux", kper] for kper in range(m.nper) ] # spatially uniform for each stress period spatial_list_props, temporal_list_props dry_kper = int(m.nper * 0.85) hds_kperk = [[kper, k] for k in range(m.nlay) for kper in [0, dry_kper, m.nper - 1]] hds_kperk sfr_obs_dict = {} sfr_obs_dict["hw"] = np.arange(1, int(m.nrow / 2)) sfr_obs_dict["tw"] = np.arange(int(m.nrow / 2), m.nrow) sfr_obs_dict["gage_1"] = [39] pst_helper = pyemu.helpers.PstFromFlopyModel( nam_file, new_model_ws=t_d, org_model_ws="temp", const_props=const_props, spatial_list_props=spatial_list_props, temporal_list_props=temporal_list_props, remove_existing=True, grid_props=props, pp_props=props, sfr_pars=["strk"], hds_kperk=hds_kperk, sfr_obs=sfr_obs_dict, build_prior=False, model_exe_name="mfnwt", pp_space=4) prep_deps.prep_template(t_d=pst_helper.new_model_ws) pst = pst_helper.pst # check out hydraulic conductivity parameters pst.parameter_data.loc[ pst.parameter_data.parnme.apply(lambda x: "hk" in x), :] # what about observations? in particular, the sfr flow-out observations? 
pst.observation_data.loc[ pst.observation_data.obgnme.apply(lambda x: "flout" in x), :] obs = pst.observation_data flout_obs = obs.loc[obs.obgnme.apply(lambda x: "flout" in x), "obsnme"] obs.loc[flout_obs, "obgnme"] = flout_obs.apply(lambda x: "_".join(x.split('_')[:-1])) obs_locs = pd.read_csv( os.path.join("..", "base_model_files", "obs_loc.csv")) #build obs names that correspond to the obsnme values in the control file obs_locs.loc[:, "site"] = obs_locs.apply( lambda x: "trgw_{0:03d}_{1:03d}".format(x.row - 1, x.col - 1), axis=1) kij_dict = { site: (2, r - 1, c - 1) for site, r, c in zip(obs_locs.site, obs_locs.row, obs_locs.col) } binary_file = os.path.join(pst_helper.m.model_ws, nam_file.replace(".nam", ".hds")) frun_line, tr_hds_df = pyemu.gw_utils.setup_hds_timeseries( binary_file, kij_dict=kij_dict, include_path=True, model=pst_helper.m) pst_helper.frun_post_lines.append(frun_line) tr_hds_df.head() [f for f in os.listdir(pst_helper.m.model_ws) if f.endswith(".ins")] df = pst_helper.pst.add_observations(os.path.join( pst_helper.m.model_ws, nam_file.replace(".nam", ".hds_timeseries.processed.ins")), pst_path=".") obs = pst_helper.pst.observation_data obs.loc[df.index, "obgnme"] = df.index.map(lambda x: "_".join(x.split("_")[:-1])) obs.loc[df.index, "weight"] = 1.0 mp_files = [f for f in os.listdir(b_d) if "mp" in f or "location" in f] [ shutil.copy2(os.path.join(b_d, f), os.path.join(pst_helper.new_model_ws, f)) for f in mp_files ] pst_helper.frun_post_lines.append( "pyemu.os_utils.run('mp6 freyberg.mpsim >mp6.stdout')") pst_helper.tmp_files.append( "freyberg.mpenpt") # placed at top of `forward_run.py` pst_helper.write_forward_run() out_file = "freyberg.mpenpt" ins_file = out_file + ".ins" with open(os.path.join(pst_helper.new_model_ws, ins_file), 'w') as f: f.write("pif ~\n") f.write("l7 w w w !part_status! w w !part_time!\n") df = pst_helper.pst.add_observations(os.path.join(pst_helper.new_model_ws, ins_file), os.path.join(pst_helper.new_model_ws, out_file), pst_path=".") for k in range(m.nlay): np.savetxt(os.path.join(pst_helper.new_model_ws, "arr_org", "prsity_layer_{0}.ref".format(k + 1)), np.zeros((m.nrow, m.ncol)) + 0.001, fmt="%15.6E") par = pst.parameter_data tag_dict = { "hk": [0.1, 10.0], "vka": [0.1, 10], "strt": [0.95, 1.05], "pr": [0.8, 1.2], "rech": [0.8, 1.2] } for t, [l, u] in tag_dict.items(): t_pars = par.loc[par.parnme.apply(lambda x: t in x), "parnme"] par.loc[t_pars, "parubnd"] = u par.loc[t_pars, "parlbnd"] = l arr_csv = os.path.join(pst_helper.new_model_ws, "arr_pars.csv") df = pd.read_csv(arr_csv, index_col=0) sy_pr = df.model_file.apply(lambda x: "sy" in x or "pr" in x) df.loc[:, "upper_bound"] = np.NaN df.loc[sy_pr, "upper_bound"] = 0.4 df.to_csv(arr_csv) pst.control_data.noptmax = 0 pst.write(os.path.join(pst_helper.new_model_ws, "freyberg.pst")) pyemu.os_utils.run("pestpp-ies freyberg.pst", cwd=pst_helper.new_model_ws) pst = pyemu.Pst(os.path.join(pst_helper.m.model_ws, "freyberg.pst")) pe = pst_helper.draw(100) pe.enforce() # always a good idea! 
pe.to_binary(os.path.join(pst_helper.new_model_ws, "prior.jcb")) pst_helper.pst.write( os.path.join(pst_helper.m.model_ws, nam_file.replace(".nam", ".pst"))) obs = pst_helper.pst.observation_data dts = pd.to_datetime(pst_helper.m.start_datetime) + pd.to_timedelta( np.cumsum(pst_helper.m.dis.perlen.array), unit='d') dts_str = list(dts.map(lambda x: x.strftime("%Y%m%d")).values) dry_kper = int(pst_helper.m.nper * 0.85) dry_dt = dts_str[dry_kper] print(dry_dt) swgw_forecasts = obs.loc[ obs.obsnme.apply(lambda x: "fa" in x and ("hw" in x or "tw" in x) and dry_dt in x), "obsnme"].tolist() hds_fore_name = "hds_00_{0:03d}_{1:03d}_{2:03d}".format( int(pst_helper.m.nrow / 3), int(pst_helper.m.ncol / 10), dry_kper) print(hds_fore_name) hds_forecasts = obs.loc[obs.obsnme.apply(lambda x: hds_fore_name in x), "obsnme"].tolist() forecasts = swgw_forecasts forecasts.extend(hds_forecasts) forecasts.append("part_time") forecasts.append("part_status") pst_helper.pst.pestpp_options["forecasts"] = forecasts pst_helper.pst.write( os.path.join(pst_helper.m.model_ws, nam_file.replace(".nam", ".pst"))) lst = flopy.utils.MfListBudget( os.path.join(pst_helper.m.model_ws, "freyberg.list"))
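The forecast dates above come from adding the cumulative stress-period lengths to the model start date. A minimal sketch of that arithmetic with made-up `perlen` values (the real ones come from the MODFLOW DIS package):

import numpy as np
import pandas as pd

start_datetime = '2015-01-01'
perlen = np.array([1.0, 30.0, 30.0, 31.0])   # hypothetical stress-period lengths, in days
dts = pd.to_datetime(start_datetime) + pd.to_timedelta(np.cumsum(perlen), unit='d')
print(list(dts.strftime('%Y%m%d')))
# ['20150102', '20150201', '20150303', '20150403']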
def create_index(self): return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1)
import pandas as pd
import numpy as np

df = pd.read_csv('timbu.csv', parse_dates=True, skiprows=12, header=None,
                 dtype=str)
df
df2 = pd.DataFrame()
df2['temperatura_C'] = pd.to_numeric(df[3] + '.' + df[4])
df2['pressao_M'] = pd.to_numeric(df[5] + '.' + df[6])
df2.index = (pd.to_datetime(df[1], dayfirst=True) +
             pd.to_timedelta(df[2])).rename('datahora')
print(df2)
df3 = pd.read_csv('dados_20190525.csv', index_col='datahora',
                  parse_dates=True)
df3
def test_iso_conversion(self):
    # GH #21877
    expected = Timedelta(1, unit="s")
    assert to_timedelta("P0DT0H0M1S") == expected
def __init__(self, index, grouper=None, obj=None, name=None, level=None, sort=True, observed=False, in_axis=False): self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) self.all_grouper = None self.index = index self.sort = sort self.obj = obj self.observed = observed self.in_axis = in_axis # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name if isinstance(grouper, MultiIndex): self.grouper = grouper.values # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level if level is not None: if not isinstance(level, int): if level not in index.names: raise AssertionError('Level {} not in index'.format(level)) level = index.names.index(level) if self.name is None: self.name = index.names[level] self.grouper, self._labels, self._group_index = \ index._get_grouper_for_level(self.grouper, level) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get labels elif isinstance(self.grouper, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False) if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj self.grouper = grouper else: if self.grouper is None and self.name is not None: self.grouper = self.obj[self.name] elif isinstance(self.grouper, (list, tuple)): self.grouper = com.asarray_tuplesafe(self.grouper) # a passed Categorical elif is_categorical_dtype(self.grouper): from pandas.core.groupby.categorical import recode_for_groupby self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes self._labels = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] if sort or self.grouper.ordered: codes = np.sort(codes) else: codes = np.arange(len(categories)) self._group_index = CategoricalIndex( Categorical.from_codes(codes=codes, categories=categories, ordered=self.grouper.ordered)) # we are done if isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, 'ndim', 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError( "Grouper for '{}' not 1-dimensional".format(t)) self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): errmsg = ('Grouper result violates len(labels) == ' 'len(data)\nresult: %s' % pprint_thing(self.grouper)) self.grouper = None # Try for sanity raise AssertionError(errmsg) # if we have a date/time-like grouper, make sure that we have # Timestamps like if getattr(self.grouper, 'dtype', None) is not None: if is_datetime64_dtype(self.grouper): from pandas import to_datetime self.grouper = to_datetime(self.grouper) elif is_timedelta64_dtype(self.grouper): from pandas import to_timedelta self.grouper = to_timedelta(self.grouper)
def test_fields(self): def check(value): # that we are int/long like assert isinstance(value, (int, compat.long)) # compat to datetime.timedelta rng = to_timedelta('1 days, 10:11:12') assert rng.days == 1 assert rng.seconds == 10 * 3600 + 11 * 60 + 12 assert rng.microseconds == 0 assert rng.nanoseconds == 0 pytest.raises(AttributeError, lambda: rng.hours) pytest.raises(AttributeError, lambda: rng.minutes) pytest.raises(AttributeError, lambda: rng.milliseconds) # GH 10050 check(rng.days) check(rng.seconds) check(rng.microseconds) check(rng.nanoseconds) td = Timedelta('-1 days, 10:11:12') assert abs(td) == Timedelta('13:48:48') assert str(td) == "-1 days +10:11:12" assert -td == Timedelta('0 days 13:48:48') assert -Timedelta('-1 days, 10:11:12').value == 49728000000000 assert Timedelta('-1 days, 10:11:12').value == -49728000000000 rng = to_timedelta('-1 days, 10:11:12.100123456') assert rng.days == -1 assert rng.seconds == 10 * 3600 + 11 * 60 + 12 assert rng.microseconds == 100 * 1000 + 123 assert rng.nanoseconds == 456 pytest.raises(AttributeError, lambda: rng.hours) pytest.raises(AttributeError, lambda: rng.minutes) pytest.raises(AttributeError, lambda: rng.milliseconds) # components tup = pd.to_timedelta(-1, 'us').components assert tup.days == -1 assert tup.hours == 23 assert tup.minutes == 59 assert tup.seconds == 59 assert tup.milliseconds == 999 assert tup.microseconds == 999 assert tup.nanoseconds == 0 # GH 10050 check(tup.days) check(tup.hours) check(tup.minutes) check(tup.seconds) check(tup.milliseconds) check(tup.microseconds) check(tup.nanoseconds) tup = Timedelta('-1 days 1 us').components assert tup.days == -2 assert tup.hours == 23 assert tup.minutes == 59 assert tup.seconds == 59 assert tup.milliseconds == 999 assert tup.microseconds == 999 assert tup.nanoseconds == 0
def test_fields(self): def check(value): # that we are int assert isinstance(value, int) # compat to datetime.timedelta rng = to_timedelta("1 days, 10:11:12") assert rng.days == 1 assert rng.seconds == 10 * 3600 + 11 * 60 + 12 assert rng.microseconds == 0 assert rng.nanoseconds == 0 msg = "'Timedelta' object has no attribute '{}'" with pytest.raises(AttributeError, match=msg.format("hours")): rng.hours with pytest.raises(AttributeError, match=msg.format("minutes")): rng.minutes with pytest.raises(AttributeError, match=msg.format("milliseconds")): rng.milliseconds # GH 10050 check(rng.days) check(rng.seconds) check(rng.microseconds) check(rng.nanoseconds) td = Timedelta("-1 days, 10:11:12") assert abs(td) == Timedelta("13:48:48") assert str(td) == "-1 days +10:11:12" assert -td == Timedelta("0 days 13:48:48") assert -Timedelta("-1 days, 10:11:12").value == 49728000000000 assert Timedelta("-1 days, 10:11:12").value == -49728000000000 rng = to_timedelta("-1 days, 10:11:12.100123456") assert rng.days == -1 assert rng.seconds == 10 * 3600 + 11 * 60 + 12 assert rng.microseconds == 100 * 1000 + 123 assert rng.nanoseconds == 456 msg = "'Timedelta' object has no attribute '{}'" with pytest.raises(AttributeError, match=msg.format("hours")): rng.hours with pytest.raises(AttributeError, match=msg.format("minutes")): rng.minutes with pytest.raises(AttributeError, match=msg.format("milliseconds")): rng.milliseconds # components tup = to_timedelta(-1, "us").components assert tup.days == -1 assert tup.hours == 23 assert tup.minutes == 59 assert tup.seconds == 59 assert tup.milliseconds == 999 assert tup.microseconds == 999 assert tup.nanoseconds == 0 # GH 10050 check(tup.days) check(tup.hours) check(tup.minutes) check(tup.seconds) check(tup.milliseconds) check(tup.microseconds) check(tup.nanoseconds) tup = Timedelta("-1 days 1 us").components assert tup.days == -2 assert tup.hours == 23 assert tup.minutes == 59 assert tup.seconds == 59 assert tup.milliseconds == 999 assert tup.microseconds == 999 assert tup.nanoseconds == 0
# %%
import numpy as np
import pandas as pd

# %%
np.datetime64("2015-07-04 12:00")

# %%
np.datetime64("2015-07-04 12:59:59.50", "ns")

# %%
date = pd.to_datetime("4th of July, 2015")
date

# %%
date.strftime("%A")

# %%
date + pd.to_timedelta(np.arange(12), "D")

# %%
index = pd.DatetimeIndex(
    ["2014-07-04", "2014-08-04", "2015-07-04", "2015-08-04"])
data = pd.Series([0, 1, 2, 3], index=index)
data

# %%
data["2014-07-04":"2015-07-04"]

# %%
data["2015"]

# %%
dates = pd.to_datetime([
def test_cf_timedelta_2d(): timedeltas = ['1D', '2D', '3D'] units = 'days' numbers = np.atleast_2d([1, 2, 3]) timedeltas = np.atleast_2d(pd.to_timedelta(timedeltas, box=False)) expected = timedeltas actual = coding.times.decode_cf_timedelta(numbers, units) assert_array_equal(expected, actual) assert expected.dtype == actual.dtype @pytest.mark.parametrize(['deltas', 'expected'], [(pd.to_timedelta(['1 day', '2 days']), 'days'), (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'), (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'), (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]) def test_infer_timedelta_units(deltas, expected): assert expected == coding.times.infer_timedelta_units(deltas) @pytest.mark.skipif(not has_cftime_or_netCDF4, reason='cftime not installed') @pytest.mark.parametrize( ['date_args', 'expected'], [((1, 2, 3, 4, 5, 6), '0001-02-03 04:05:06.000000'), ((10, 2, 3, 4, 5, 6), '0010-02-03 04:05:06.000000'), ((100, 2, 3, 4, 5, 6), '0100-02-03 04:05:06.000000'), ((1000, 2, 3, 4, 5, 6), '1000-02-03 04:05:06.000000')]) def test_format_cftime_datetime(date_args, expected):
def filterPrep(df, string, fltr, time): colNames = [ 'EVSE ID', 'Port Number', 'Port Type', 'Station Name', 'Plug In Event Id', 'City', 'Latitude', 'Longitude', 'User ID', 'Driver Postal Code', 'Start Date', 'End Date', 'Total Duration (hh:mm:ss)', 'Charging Time (hh:mm:ss)', 'Energy (kWh)', 'Ended By', 'Start SOC', 'End SOC' ] df = pd.DataFrame(df, index=np.arange(len(df)), columns=colNames) #filter for dfcf #df = df.loc[df['Port Type'] == 'DC Fast'] df['Start Date'] = pd.to_datetime(df['Start Date']) df['End Date'] = pd.to_datetime(df['End Date']) df['Total Duration (hh:mm:ss)'] = pd.to_timedelta( df['Total Duration (hh:mm:ss)']) df['Charging Time (hh:mm:ss)'] = pd.to_timedelta( df['Charging Time (hh:mm:ss)']) #filter by City if fltr: df = df[df['City'].str.contains(string)] print("Filter for: ", string) else: print("No Filter") #clean data df = df.loc[df['Energy (kWh)'] > 0] df = df.loc[~pd.isnull(df['End Date'])] yr = 2017 df = df.loc[(df['Start Date'] > datetime.date(yr, 12, 1)) & (df['Start Date'] < datetime.date(yr + 2, 12, 1))] #update data types df['Duration (h)'] = df['Total Duration (hh:mm:ss)'].apply( lambda x: x.seconds / 3600) #df['Duration (h)'] = df['Duration (h)'].apply(lambda x: round(x * 4) / 4) df['Charging (h)'] = df['Charging Time (hh:mm:ss)'].apply( lambda x: x.seconds / 3600) #df['Charging (h)'] = df['Charging (h)'].apply(lambda x: round(x * 4) / 4) df['NoCharge (h)'] = df['Duration (h)'] - df['Charging (h)'] df = df.loc[df['Duration (h)'] > 0] # Day of year 0 = Jan1 and day of year 365 = Dec31 df['DayofYr'] = df['Start Date'].apply(lambda x: x.dayofyear) # Monday is 0 and Sunday is 6 df['DayofWk'] = df['Start Date'].apply(lambda x: x.weekday()) # Filter for weekdays df = df.loc[df['DayofWk'] <= 4] #df['isWeekday'] = df['DayofWk'].apply(lambda x: 1 if x <=4 else 0) #df = df.loc[df['isWeekday'] == 1] df['Year'] = df['Start Date'].apply(lambda x: x.year) df['StartHr'] = df['Start Date'].apply(lambda x: x.hour + x.minute / 60) df['EndHr'] = df['End Date'].apply(lambda x: x.hour + x.minute / 60) if time == 'hour': df['StartHr'] = df['StartHr'].apply(lambda x: np.floor(x)) df['EndHr'] = df['EndHr'].apply(lambda x: np.floor(x)) elif time == '15min': df['StartHr'] = df['StartHr'].apply(lambda x: round(x * 4) / 4) df['EndHr'] = df['EndHr'].apply(lambda x: round(x * 4) / 4) elif time == '5min': df['StartHr'] = df['StartHr'].apply(lambda x: round(x * 4) / 12) df['EndHr'] = df['EndHr'].apply(lambda x: round(x * 4) / 12) df['AvgPwr'] = df['Energy (kWh)'] / df['Duration (h)'] df['Date'] = df['Start Date'].apply( lambda x: str(x.year) + '-' + str(x.month) + '-' + str(x.day)) #convert percent to float def p2f(s): if isinstance(s, str): x = s.strip('%') x = float(x) / 100 return x else: return s df['Start SOC'] = df['Start SOC'].apply(lambda x: p2f(x)) df['End SOC'] = df['End SOC'].apply(lambda x: p2f(x)) # Sort Dataframe df.sort_values(['Start Date'], inplace=True) df = df.reset_index(drop=True) # Assign Day Count df['dayCount'] = 0 days = list(df['Start Date'].apply( lambda x: str(x.year) + '-' + str(x.month) + '-' + str(x.day))) daysSet = sorted(set(days), key=days.index) c = 0 for d in daysSet: dateTest = [df['Date'] == d] trueIdx = list(dateTest[0][dateTest[0]].index) df.at[trueIdx, 'dayCount'] = c c += 1 return df
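Illustrative aside only (not part of filterPrep): the same quarter-hour bucketing of start times can also be expressed with `Series.dt.round`, which is sometimes easier to read than rounding fractional hours by hand:

import pandas as pd

start = pd.Series(pd.to_datetime(['2018-01-05 08:08:00', '2018-01-05 17:52:00']))
start_15min = start.dt.round('15min')
start_hr = start_15min.dt.hour + start_15min.dt.minute / 60
print(start_hr.tolist())   # [8.25, 17.75]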
break_df = pd.merge(break_df, summary[['Login Time', 'Agent Name']]) break_df = pd.merge(break_df, agentlist[['Team Name (ID)', 'Agent Name']]) # Drop Code column break_df = break_df.drop(columns='Code') ''' Data wrangling and feature engineering ''' # Remove all teams other than HERE: break_df = break_df.where( break_df['Team Name (ID)'] == 'Here Navigation (205587)') break_df = break_df.dropna() # Change Login Time and Duration from str to time break_df['Login Time'] = pd.to_timedelta(break_df['Login Time']) break_df['Duration'] = pd.to_timedelta(break_df['Duration in Seconds'], unit='Seconds') # Add new column for percent of time logged in spent in break break_df['Percent'] = break_df['Duration']/break_df['Login Time'] # Create index for Web (Paper) Agents and drop them from dataframe paper_agents_index = break_df[break_df['Agent Name'].str.contains ('Paper')].index break_df.drop(paper_agents_index, inplace=True) # Clean agent names break_df['Agent Name'] = break_df['Agent Name'].str.replace( '(', '', regex=False).str.replace(')', '', regex=False).str.replace( '0', '', regex=False).str.replace('1', '', regex=False).str.replace(
def create_session_col( data: pd.DataFrame, user_identifier_cols: List[str], time_col: str, max_session_time_mins: int, max_event_separation_mins: int, ) -> pd.DataFrame: """ Create a "session_ind" column in the dataframe. In particular, the session_ind column will be incremented each time a new session starts. Parameters ---------- data: pd.DataFrame This dataframe should contain at least the following columns: - time stamp column - columns related to user name and/or computer name and/or ip address etc user_identifier_cols: List[str] Name of the columns which contain username and/or computer name and/or ip address etc. Each time the value of one of these columns changes, a new session will be started. time_col: str Name of the column which contains a time stamp. If this column is not already in datetime64[ns, UTC] format, it will be casted to it. max_session_time_mins: int The maximum length of a session in minutes. If a sequence of events for the same user_identifier_cols values exceeds this length, then a new session will be started. max_event_separation_mins: int The maximum length in minutes between two events in a session. If we have 2 events for the same user_identifier_cols values, and if those two events are more than `max_event_separation_mins` apart, then a new session will be started. Returns ------- pd.DataFrame with an additional "session_ind" column """ max_sep = pd.to_timedelta(max_event_separation_mins, unit="min") max_ses = pd.to_timedelta(max_session_time_mins, unit="min") df_with_sesind = data.copy() if not isinstance(df_with_sesind[time_col].dtype, DatetimeTZDtype): df_with_sesind[time_col] = pd.to_datetime(df_with_sesind[time_col]) final_cols = list(df_with_sesind.columns) + ["session_ind"] if len(df_with_sesind) == 0: df_with_sesind["session_ind"] = None return df_with_sesind # Sessionising will not work properly with nans. Temporarily replace nan values with dummy_str. for col in user_identifier_cols: df_with_sesind[col] = df_with_sesind[col].fillna("dummy_str") df_with_sesind = df_with_sesind.sort_values(user_identifier_cols + [time_col]).reset_index( drop=True) # initialise first row ses_ind = 0 df_with_sesind.loc[0, "time_diff"] = pd.to_timedelta(0) df_with_sesind.loc[0, "cml_time"] = pd.to_timedelta(0) df_with_sesind.loc[0, "session_ind"] = ses_ind for i in range(1, len(df_with_sesind)): cur = df_with_sesind.iloc[i] prev = df_with_sesind.iloc[i - 1] # if any of the user_identifier_cols values change, a new session should start new_flag = False for col in user_identifier_cols: if cur[col] != prev[col]: new_flag = True break dif = cur[time_col] - prev[time_col] cml = prev["cml_time"] + dif # if the max session length is exceeded or the max separation between events is exceeded, # a new session should start if dif > max_sep or cml > max_ses: new_flag = True if new_flag: df_with_sesind.loc[i, "time_diff"] = pd.to_timedelta(0) df_with_sesind.loc[i, "cml_time"] = pd.to_timedelta(0) ses_ind += 1 df_with_sesind.loc[i, "session_ind"] = ses_ind else: df_with_sesind.loc[i, "time_diff"] = dif df_with_sesind.loc[i, "cml_time"] = cml df_with_sesind.loc[i, "session_ind"] = ses_ind # replace dummy_str with nan values for col in user_identifier_cols: df_with_sesind[col] = df_with_sesind[col].replace("dummy_str", np.nan) return df_with_sesind[final_cols]
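A minimal usage sketch for create_session_col as defined above, on a tiny hypothetical event table:

import pandas as pd

events = pd.DataFrame({
    'UserId': ['alice', 'alice', 'alice', 'bob'],
    'TimeGenerated': pd.to_datetime([
        '2021-03-01 10:00:00', '2021-03-01 10:05:00',
        '2021-03-01 12:00:00', '2021-03-01 10:02:00']),
})
sessions = create_session_col(
    events,
    user_identifier_cols=['UserId'],
    time_col='TimeGenerated',
    max_session_time_mins=20,
    max_event_separation_mins=15,
)
print(sessions[['UserId', 'TimeGenerated', 'session_ind']])
# alice's 12:00 event exceeds the 15-minute gap and opens a new session;
# bob's single event becomes its own session.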
import pandas as pd
import pastas as ps

# read observations
obs = ps.read_dino('data/B58C0698001_1.csv')

# Create the time series model
ml = ps.Model(obs)

# read weather data
knmi = ps.read.knmi.KnmiStation.fromfile(
    'data/neerslaggeg_HEIBLOEM-L_967-2.txt')
rain = ps.TimeSeries(knmi.data['RD'], settings='prec')
evap = ps.read_knmi('data/etmgeg_380.txt', variables='EV24')
if True:
    # also add 9 hours to the evaporation
    s = evap.series_original
    s.index = s.index + pd.to_timedelta(9, 'h')
    evap.series_original = s

# Create stress
sm = ps.StressModel2(stress=[rain, evap], rfunc=ps.Exponential,
                     name='recharge')
ml.add_stressmodel(sm)

# set the time-offset of the model. This should be done automatically in the
# future.
ml._set_time_offset()

## Solve
ml.solve(freq='D')
ml.plots.decomposition()
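A minimal sketch of the 9-hour index shift applied to the evaporation series above, on a hypothetical two-day series:

import pandas as pd

s = pd.Series([1.0, 2.0], index=pd.to_datetime(['2019-01-01', '2019-01-02']))
s.index = s.index + pd.to_timedelta(9, 'h')
print(s.index)
# DatetimeIndex(['2019-01-01 09:00:00', '2019-01-02 09:00:00'], dtype='datetime64[ns]', freq=None)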
class TestIntervalIndex(Base): _holder = IntervalIndex def setup_method(self, method): self.index = IntervalIndex.from_arrays([0, 1], [1, 2]) self.index_with_nan = IntervalIndex.from_tuples([(0, 1), np.nan, (1, 2)]) self.indices = dict(intervalIndex=tm.makeIntervalIndex(10)) def create_index(self, closed='right'): return IntervalIndex.from_breaks(range(11), closed=closed) def create_index_with_nan(self, closed='right'): mask = [True, False] + [True] * 8 return IntervalIndex.from_arrays(np.where(mask, np.arange(10), np.nan), np.where(mask, np.arange(1, 11), np.nan), closed=closed) @pytest.mark.parametrize('data', [ Index([0, 1, 2, 3, 4]), Index(list('abcde')), date_range('2017-01-01', periods=5), date_range('2017-01-01', periods=5, tz='US/Eastern'), timedelta_range('1 day', periods=5) ]) def test_constructors(self, data, closed, name): left, right = data[:-1], data[1:] ivs = [Interval(l, r, closed=closed) for l, r in lzip(left, right)] expected = IntervalIndex._simple_new(left=left, right=right, closed=closed, name=name) # validate expected assert expected.closed == closed assert expected.name == name assert expected.dtype.subtype == data.dtype tm.assert_index_equal(expected.left, data[:-1]) tm.assert_index_equal(expected.right, data[1:]) # validated constructors result = IntervalIndex(ivs, name=name) tm.assert_index_equal(result, expected) result = IntervalIndex.from_intervals(ivs, name=name) tm.assert_index_equal(result, expected) result = IntervalIndex.from_breaks(data, closed=closed, name=name) tm.assert_index_equal(result, expected) result = IntervalIndex.from_arrays(left, right, closed=closed, name=name) tm.assert_index_equal(result, expected) result = IntervalIndex.from_tuples(lzip(left, right), closed=closed, name=name) tm.assert_index_equal(result, expected) result = Index(ivs, name=name) assert isinstance(result, IntervalIndex) tm.assert_index_equal(result, expected) # idempotent tm.assert_index_equal(Index(expected), expected) tm.assert_index_equal(IntervalIndex(expected), expected) result = IntervalIndex.from_intervals(expected) tm.assert_index_equal(result, expected) result = IntervalIndex.from_intervals(expected.values, name=expected.name) tm.assert_index_equal(result, expected) left, right = expected.left, expected.right result = IntervalIndex.from_arrays(left, right, closed=expected.closed, name=expected.name) tm.assert_index_equal(result, expected) result = IntervalIndex.from_tuples(expected.to_tuples(), closed=expected.closed, name=expected.name) tm.assert_index_equal(result, expected) breaks = expected.left.tolist() + [expected.right[-1]] result = IntervalIndex.from_breaks(breaks, closed=expected.closed, name=expected.name) tm.assert_index_equal(result, expected) @pytest.mark.parametrize('data', [[np.nan], [np.nan] * 2, [np.nan] * 50]) def test_constructors_nan(self, closed, data): # GH 18421 expected_values = np.array(data, dtype=object) expected_idx = IntervalIndex(data, closed=closed) # validate the expected index assert expected_idx.closed == closed tm.assert_numpy_array_equal(expected_idx.values, expected_values) result = IntervalIndex.from_tuples(data, closed=closed) tm.assert_index_equal(result, expected_idx) tm.assert_numpy_array_equal(result.values, expected_values) result = IntervalIndex.from_breaks([np.nan] + data, closed=closed) tm.assert_index_equal(result, expected_idx) tm.assert_numpy_array_equal(result.values, expected_values) result = IntervalIndex.from_arrays(data, data, closed=closed) tm.assert_index_equal(result, expected_idx) 
tm.assert_numpy_array_equal(result.values, expected_values) if closed == 'right': # Can't specify closed for IntervalIndex.from_intervals result = IntervalIndex.from_intervals(data) tm.assert_index_equal(result, expected_idx) tm.assert_numpy_array_equal(result.values, expected_values) @pytest.mark.parametrize('data', [[], np.array([], dtype='int64'), np.array([], dtype='float64'), np.array([], dtype=object)]) def test_constructors_empty(self, data, closed): # GH 18421 expected_dtype = data.dtype if isinstance(data, np.ndarray) else object expected_values = np.array([], dtype=object) expected_index = IntervalIndex(data, closed=closed) # validate the expected index assert expected_index.empty assert expected_index.closed == closed assert expected_index.dtype.subtype == expected_dtype tm.assert_numpy_array_equal(expected_index.values, expected_values) result = IntervalIndex.from_tuples(data, closed=closed) tm.assert_index_equal(result, expected_index) tm.assert_numpy_array_equal(result.values, expected_values) result = IntervalIndex.from_breaks(data, closed=closed) tm.assert_index_equal(result, expected_index) tm.assert_numpy_array_equal(result.values, expected_values) result = IntervalIndex.from_arrays(data, data, closed=closed) tm.assert_index_equal(result, expected_index) tm.assert_numpy_array_equal(result.values, expected_values) if closed == 'right': # Can't specify closed for IntervalIndex.from_intervals result = IntervalIndex.from_intervals(data) tm.assert_index_equal(result, expected_index) tm.assert_numpy_array_equal(result.values, expected_values) def test_constructors_errors(self): # scalar msg = (r'IntervalIndex\(...\) must be called with a collection of ' 'some kind, 5 was passed') with tm.assert_raises_regex(TypeError, msg): IntervalIndex(5) # not an interval msg = ("type <(class|type) 'numpy.int64'> with value 0 " "is not an interval") with tm.assert_raises_regex(TypeError, msg): IntervalIndex([0, 1]) with tm.assert_raises_regex(TypeError, msg): IntervalIndex.from_intervals([0, 1]) # invalid closed msg = "invalid options for 'closed': invalid" with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid') # mismatched closed within intervals msg = 'intervals must all be closed on the same side' with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_intervals( [Interval(0, 1), Interval(1, 2, closed='left')]) with tm.assert_raises_regex(ValueError, msg): IntervalIndex([Interval(0, 1), Interval(2, 3, closed='left')]) with tm.assert_raises_regex(ValueError, msg): Index([Interval(0, 1), Interval(2, 3, closed='left')]) # mismatched closed inferred from intervals vs constructor. 
msg = 'conflicting values for closed' with tm.assert_raises_regex(ValueError, msg): iv = [Interval(0, 1, closed='both'), Interval(1, 2, closed='both')] IntervalIndex(iv, closed='neither') # no point in nesting periods in an IntervalIndex msg = 'Period dtypes are not supported, use a PeriodIndex instead' with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_breaks(pd.period_range('2000-01-01', periods=3)) # decreasing breaks/arrays msg = 'left side of interval must be <= right side' with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_breaks(range(10, -1, -1)) with tm.assert_raises_regex(ValueError, msg): IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1)) @pytest.mark.parametrize('tz_left, tz_right', [(None, 'UTC'), ('UTC', None), ('UTC', 'US/Eastern')]) def test_constructors_errors_tz(self, tz_left, tz_right): # GH 18537 left = date_range('2017-01-01', periods=4, tz=tz_left) right = date_range('2017-01-02', periods=4, tz=tz_right) # don't need to check IntervalIndex(...) or from_intervals, since # mixed tz are disallowed at the Interval level with pytest.raises(ValueError): IntervalIndex.from_arrays(left, right) with pytest.raises(ValueError): IntervalIndex.from_tuples(lzip(left, right)) with pytest.raises(ValueError): breaks = left.tolist() + [right[-1]] IntervalIndex.from_breaks(breaks) def test_properties(self, closed): index = self.create_index(closed=closed) assert len(index) == 10 assert index.size == 10 assert index.shape == (10, ) tm.assert_index_equal(index.left, Index(np.arange(10))) tm.assert_index_equal(index.right, Index(np.arange(1, 11))) tm.assert_index_equal(index.mid, Index(np.arange(0.5, 10.5))) assert index.closed == closed ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))] expected = np.array(ivs, dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) tm.assert_numpy_array_equal(index.values, expected) # with nans index = self.create_index_with_nan(closed=closed) assert len(index) == 10 assert index.size == 10 assert index.shape == (10, ) expected_left = Index([0, np.nan, 2, 3, 4, 5, 6, 7, 8, 9]) expected_right = expected_left + 1 expected_mid = expected_left + 0.5 tm.assert_index_equal(index.left, expected_left) tm.assert_index_equal(index.right, expected_right) tm.assert_index_equal(index.mid, expected_mid) assert index.closed == closed ivs = [ Interval(l, r, closed) if notna(l) else np.nan for l, r in zip(expected_left, expected_right) ] expected = np.array(ivs, dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) tm.assert_numpy_array_equal(index.values, expected) @pytest.mark.parametrize( 'breaks', [[1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608], [-np.inf, -100, -10, 0.5, 1, 1.5, 3.8, 101, 202, np.inf], pd.to_datetime(['20170101', '20170202', '20170303', '20170404']), pd.to_timedelta(['1ns', '2ms', '3s', '4M', '5H', '6D'])]) def test_length(self, closed, breaks): # GH 18789 index = IntervalIndex.from_breaks(breaks, closed=closed) result = index.length expected = Index(iv.length for iv in index) tm.assert_index_equal(result, expected) # with NA index = index.insert(1, np.nan) result = index.length expected = Index(iv.length if notna(iv) else iv for iv in index) tm.assert_index_equal(result, expected) @pytest.mark.parametrize('breaks', [ list('abcdefgh'), lzip(range(10), range(1, 11)), [['A', 'B'], ['a', 'b'], ['c', 'd'], ['e', 'f']], [Interval(0, 1), Interval(1, 2), Interval(3, 4), Interval(4, 5)] ]) def test_length_errors(self, closed, breaks): # GH 18789 index = 
IntervalIndex.from_breaks(breaks) msg = 'IntervalIndex contains Intervals without defined length' with tm.assert_raises_regex(TypeError, msg): index.length def test_with_nans(self, closed): index = self.create_index(closed=closed) assert not index.hasnans result = index.isna() expected = np.repeat(False, len(index)) tm.assert_numpy_array_equal(result, expected) result = index.notna() expected = np.repeat(True, len(index)) tm.assert_numpy_array_equal(result, expected) index = self.create_index_with_nan(closed=closed) assert index.hasnans result = index.isna() expected = np.array([False, True] + [False] * (len(index) - 2)) tm.assert_numpy_array_equal(result, expected) result = index.notna() expected = np.array([True, False] + [True] * (len(index) - 2)) tm.assert_numpy_array_equal(result, expected) def test_copy(self, closed): expected = self.create_index(closed=closed) result = expected.copy() assert result.equals(expected) result = expected.copy(deep=True) assert result.equals(expected) assert result.left is not expected.left def test_ensure_copied_data(self, closed): # exercise the copy flag in the constructor # not copying index = self.create_index(closed=closed) result = IntervalIndex(index, copy=False) tm.assert_numpy_array_equal(index.left.values, result.left.values, check_same='same') tm.assert_numpy_array_equal(index.right.values, result.right.values, check_same='same') # by-definition make a copy result = IntervalIndex.from_intervals(index.values, copy=False) tm.assert_numpy_array_equal(index.left.values, result.left.values, check_same='copy') tm.assert_numpy_array_equal(index.right.values, result.right.values, check_same='copy') def test_equals(self, closed): expected = IntervalIndex.from_breaks(np.arange(5), closed=closed) assert expected.equals(expected) assert expected.equals(expected.copy()) assert not expected.equals(expected.astype(object)) assert not expected.equals(np.array(expected)) assert not expected.equals(list(expected)) assert not expected.equals([1, 2]) assert not expected.equals(np.array([1, 2])) assert not expected.equals(pd.date_range('20130101', periods=2)) expected_name1 = IntervalIndex.from_breaks(np.arange(5), closed=closed, name='foo') expected_name2 = IntervalIndex.from_breaks(np.arange(5), closed=closed, name='bar') assert expected.equals(expected_name1) assert expected_name1.equals(expected_name2) for other_closed in {'left', 'right', 'both', 'neither'} - {closed}: expected_other_closed = IntervalIndex.from_breaks( np.arange(5), closed=other_closed) assert not expected.equals(expected_other_closed) def test_astype(self, closed): idx = self.create_index(closed=closed) result = idx.astype(object) tm.assert_index_equal(result, Index(idx.values, dtype='object')) assert not idx.equals(result) assert idx.equals(IntervalIndex.from_intervals(result)) result = idx.astype('interval') tm.assert_index_equal(result, idx) assert result.equals(idx) @pytest.mark.parametrize('dtype', [ np.int64, np.float64, 'period[M]', 'timedelta64', 'datetime64[ns]', 'datetime64[ns, US/Eastern]' ]) def test_astype_errors(self, closed, dtype): idx = self.create_index(closed=closed) msg = 'Cannot cast IntervalIndex to dtype' with tm.assert_raises_regex(TypeError, msg): idx.astype(dtype) @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) def test_where(self, closed, klass): idx = self.create_index(closed=closed) cond = [True] * len(idx) expected = idx result = expected.where(klass(cond)) tm.assert_index_equal(result, expected) cond = [False] + [True] * len(idx[1:]) 
expected = IntervalIndex([np.nan] + idx[1:].tolist()) result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) def test_delete(self, closed): expected = IntervalIndex.from_breaks(np.arange(1, 11), closed=closed) result = self.create_index(closed=closed).delete(0) tm.assert_index_equal(result, expected) @pytest.mark.parametrize('data', [ interval_range(0, periods=10, closed='neither'), interval_range(1.7, periods=8, freq=2.5, closed='both'), interval_range(Timestamp('20170101'), periods=12, closed='left'), interval_range(Timedelta('1 day'), periods=6, closed='right'), IntervalIndex.from_tuples([('a', 'd'), ('e', 'j'), ('w', 'z')]), IntervalIndex.from_tuples([(1, 2), ('a', 'z'), (3.14, 6.28)]) ]) def test_insert(self, data): item = data[0] idx_item = IntervalIndex([item]) # start expected = idx_item.append(data) result = data.insert(0, item) tm.assert_index_equal(result, expected) # end expected = data.append(idx_item) result = data.insert(len(data), item) tm.assert_index_equal(result, expected) # mid expected = data[:3].append(idx_item).append(data[3:]) result = data.insert(3, item) tm.assert_index_equal(result, expected) # invalid type msg = 'can only insert Interval objects and NA into an IntervalIndex' with tm.assert_raises_regex(ValueError, msg): data.insert(1, 'foo') # invalid closed msg = 'inserted item must be closed on the same side as the index' for closed in {'left', 'right', 'both', 'neither'} - {item.closed}: with tm.assert_raises_regex(ValueError, msg): bad_item = Interval(item.left, item.right, closed=closed) data.insert(1, bad_item) # GH 18295 (test missing) na_idx = IntervalIndex([np.nan], closed=data.closed) for na in (np.nan, pd.NaT, None): expected = data[:1].append(na_idx).append(data[1:]) result = data.insert(1, na) tm.assert_index_equal(result, expected) def test_take(self, closed): index = self.create_index(closed=closed) result = index.take(range(10)) tm.assert_index_equal(result, index) result = index.take([0, 0, 1]) expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2], closed=closed) tm.assert_index_equal(result, expected) def test_unique(self, closed): # unique non-overlapping idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)], closed=closed) assert idx.is_unique # unique overlapping - distinct endpoints idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)], closed=closed) assert idx.is_unique # unique overlapping - shared endpoints idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed) assert idx.is_unique # unique nested idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)], closed=closed) assert idx.is_unique # duplicate idx = IntervalIndex.from_tuples([(0, 1), (0, 1), (2, 3)], closed=closed) assert not idx.is_unique # unique mixed idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b')], closed=closed) assert idx.is_unique # duplicate mixed idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b'), (0, 1)], closed=closed) assert not idx.is_unique # empty idx = IntervalIndex([], closed=closed) assert idx.is_unique def test_monotonic(self, closed): # increasing non-overlapping idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # decreasing non-overlapping idx = IntervalIndex.from_tuples([(4, 5), (2, 3), (1, 2)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert 
idx._is_strictly_monotonic_decreasing # unordered non-overlapping idx = IntervalIndex.from_tuples([(0, 1), (4, 5), (2, 3)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # increasing overlapping idx = IntervalIndex.from_tuples([(0, 2), (0.5, 2.5), (1, 3)], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # decreasing overlapping idx = IntervalIndex.from_tuples([(1, 3), (0.5, 2.5), (0, 2)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert idx._is_strictly_monotonic_decreasing # unordered overlapping idx = IntervalIndex.from_tuples([(0.5, 2.5), (0, 2), (1, 3)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # increasing overlapping shared endpoints idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # decreasing overlapping shared endpoints idx = pd.IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert idx._is_strictly_monotonic_decreasing # stationary idx = IntervalIndex.from_tuples([(0, 1), (0, 1)], closed=closed) assert idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # empty idx = IntervalIndex([], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert idx._is_strictly_monotonic_decreasing @pytest.mark.skip(reason='not a valid repr as we use interval notation') def test_repr(self): i = IntervalIndex.from_tuples([(0, 1), (1, 2)], closed='right') expected = ("IntervalIndex(left=[0, 1]," "\n right=[1, 2]," "\n closed='right'," "\n dtype='interval[int64]')") assert repr(i) == expected i = IntervalIndex.from_tuples( (Timestamp('20130101'), Timestamp('20130102')), (Timestamp('20130102'), Timestamp('20130103')), closed='right') expected = ("IntervalIndex(left=['2013-01-01', '2013-01-02']," "\n right=['2013-01-02', '2013-01-03']," "\n closed='right'," "\n dtype='interval[datetime64[ns]]')") assert repr(i) == expected @pytest.mark.skip(reason='not a valid repr as we use interval notation') def test_repr_max_seq_item_setting(self): super(TestIntervalIndex, self).test_repr_max_seq_item_setting() @pytest.mark.skip(reason='not a valid repr as we use interval notation') def test_repr_roundtrip(self): super(TestIntervalIndex, self).test_repr_roundtrip() # TODO: check this behavior is consistent with test_interval_new.py def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) assert i[0] == Interval(0.0, 1.0, closed=closed) assert i[1] == Interval(1.0, 2.0, closed=closed) assert isna(i[2]) result = i[0:1] expected = IntervalIndex.from_arrays((0., ), (1., ), closed=closed) tm.assert_index_equal(result, expected) result = i[0:2] expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed=closed) 
tm.assert_index_equal(result, expected) result = i[1:3] expected = IntervalIndex.from_arrays((1., np.nan), (2., np.nan), closed=closed) tm.assert_index_equal(result, expected) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_get_loc_value(self): pytest.raises(KeyError, self.index.get_loc, 0) assert self.index.get_loc(0.5) == 0 assert self.index.get_loc(1) == 0 assert self.index.get_loc(1.5) == 1 assert self.index.get_loc(2) == 1 pytest.raises(KeyError, self.index.get_loc, -1) pytest.raises(KeyError, self.index.get_loc, 3) idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) assert idx.get_loc(0.5) == 0 assert idx.get_loc(1) == 0 tm.assert_numpy_array_equal(idx.get_loc(1.5), np.array([0, 1], dtype='int64')) tm.assert_numpy_array_equal(np.sort(idx.get_loc(2)), np.array([0, 1], dtype='int64')) assert idx.get_loc(3) == 1 pytest.raises(KeyError, idx.get_loc, 3.5) idx = IntervalIndex.from_arrays([0, 2], [1, 3]) pytest.raises(KeyError, idx.get_loc, 1.5) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def slice_locs_cases(self, breaks): # TODO: same tests for more index types index = IntervalIndex.from_breaks([0, 1, 2], closed='right') assert index.slice_locs() == (0, 2) assert index.slice_locs(0, 1) == (0, 1) assert index.slice_locs(1, 1) == (0, 1) assert index.slice_locs(0, 2) == (0, 2) assert index.slice_locs(0.5, 1.5) == (0, 2) assert index.slice_locs(0, 0.5) == (0, 1) assert index.slice_locs(start=1) == (0, 2) assert index.slice_locs(start=1.2) == (1, 2) assert index.slice_locs(end=1) == (0, 1) assert index.slice_locs(end=1.1) == (0, 2) assert index.slice_locs(end=1.0) == (0, 1) assert index.slice_locs(-1, -1) == (0, 0) index = IntervalIndex.from_breaks([0, 1, 2], closed='neither') assert index.slice_locs(0, 1) == (0, 1) assert index.slice_locs(0, 2) == (0, 2) assert index.slice_locs(0.5, 1.5) == (0, 2) assert index.slice_locs(1, 1) == (1, 1) assert index.slice_locs(1, 2) == (1, 2) index = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)], closed='both') assert index.slice_locs(1, 1) == (0, 1) assert index.slice_locs(1, 2) == (0, 2) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_slice_locs_int64(self): self.slice_locs_cases([0, 1, 2]) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_slice_locs_float64(self): self.slice_locs_cases([0.0, 1.0, 2.0]) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def slice_locs_decreasing_cases(self, tuples): index = IntervalIndex.from_tuples(tuples) assert index.slice_locs(1.5, 0.5) == (1, 3) assert index.slice_locs(2, 0) == (1, 3) assert index.slice_locs(2, 1) == (1, 3) assert index.slice_locs(3, 1.1) == (0, 3) assert index.slice_locs(3, 3) == (0, 2) assert index.slice_locs(3.5, 3.3) == (0, 1) assert index.slice_locs(1, -3) == (2, 3) slice_locs = index.slice_locs(-1, -1) assert slice_locs[0] == slice_locs[1] # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_slice_locs_decreasing_int64(self): self.slice_locs_cases([(2, 4), (1, 3), (0, 2)]) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_slice_locs_decreasing_float64(self): self.slice_locs_cases([(2., 4.), (1., 3.), (0., 2.)]) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_slice_locs_fails(self): index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)]) with pytest.raises(KeyError): index.slice_locs(1, 2) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def 
test_get_loc_interval(self): assert self.index.get_loc(Interval(0, 1)) == 0 assert self.index.get_loc(Interval(0, 0.5)) == 0 assert self.index.get_loc(Interval(0, 1, 'left')) == 0 pytest.raises(KeyError, self.index.get_loc, Interval(2, 3)) pytest.raises(KeyError, self.index.get_loc, Interval(-1, 0, 'left')) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_get_indexer(self): actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) actual = self.index.get_indexer(self.index) expected = np.array([0, 1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) index = IntervalIndex.from_breaks([0, 1, 2], closed='left') actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) expected = np.array([-1, 0, 0, 1, 1, -1, -1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) actual = self.index.get_indexer(index[:1]) expected = np.array([0], dtype='intp') tm.assert_numpy_array_equal(actual, expected) actual = self.index.get_indexer(index) expected = np.array([-1, 1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_get_indexer_subintervals(self): # TODO: is this right? # return indexers for wholly contained subintervals target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) actual = self.index.get_indexer(target) expected = np.array([0, 0, 1, 1], dtype='p') tm.assert_numpy_array_equal(actual, expected) target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) actual = self.index.get_indexer(target) expected = np.array([0, 0, 1, 1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) actual = self.index.get_indexer(target[[0, -1]]) expected = np.array([0, 1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left') actual = self.index.get_indexer(target) expected = np.array([0, 0, 0], dtype='intp') tm.assert_numpy_array_equal(actual, expected) # To be removed, replaced by test_interval_new.py (see #16316, #16386) def test_contains(self): # Only endpoints are valid. 
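# note: `in` membership only matches Interval objects here, never scalar points; scalar lookups go through the .contains() method exercised in testcontains below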
i = IntervalIndex.from_arrays([0, 1], [1, 2]) # Invalid assert 0 not in i assert 1 not in i assert 2 not in i # Valid assert Interval(0, 1) in i assert Interval(0, 2) in i assert Interval(0, 0.5) in i assert Interval(3, 5) not in i assert Interval(-1, 0, closed='left') not in i # To be removed, replaced by test_interval_new.py (see #16316, #16386) def testcontains(self): # can select values that are IN the range of a value i = IntervalIndex.from_arrays([0, 1], [1, 2]) assert i.contains(0.1) assert i.contains(0.5) assert i.contains(1) assert i.contains(Interval(0, 1)) assert i.contains(Interval(0, 2)) # these overlaps completely assert i.contains(Interval(0, 3)) assert i.contains(Interval(1, 3)) assert not i.contains(20) assert not i.contains(-20) def test_dropna(self, closed): expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)], closed=closed) ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan], closed=closed) result = ii.dropna() tm.assert_index_equal(result, expected) ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan], closed=closed) result = ii.dropna() tm.assert_index_equal(result, expected) # TODO: check this behavior is consistent with test_interval_new.py def test_non_contiguous(self, closed): index = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed) target = [0.5, 1.5, 2.5] actual = index.get_indexer(target) expected = np.array([0, -1, 1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) assert 1.5 not in index def test_union(self, closed): index = self.create_index(closed=closed) other = IntervalIndex.from_breaks(range(5, 13), closed=closed) expected = IntervalIndex.from_breaks(range(13), closed=closed) result = index.union(other) tm.assert_index_equal(result, expected) result = other.union(index) tm.assert_index_equal(result, expected) tm.assert_index_equal(index.union(index), index) tm.assert_index_equal(index.union(index[:1]), index) def test_intersection(self, closed): index = self.create_index(closed=closed) other = IntervalIndex.from_breaks(range(5, 13), closed=closed) expected = IntervalIndex.from_breaks(range(5, 11), closed=closed) result = index.intersection(other) tm.assert_index_equal(result, expected) result = other.intersection(index) tm.assert_index_equal(result, expected) tm.assert_index_equal(index.intersection(index), index) def test_difference(self, closed): index = self.create_index(closed=closed) tm.assert_index_equal(index.difference(index[:1]), index[1:]) def test_symmetric_difference(self, closed): idx = self.create_index(closed=closed) result = idx[1:].symmetric_difference(idx[:-1]) expected = IntervalIndex([idx[0], idx[-1]]) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( 'op_name', ['union', 'intersection', 'difference', 'symmetric_difference']) def test_set_operation_errors(self, closed, op_name): index = self.create_index(closed=closed) set_op = getattr(index, op_name) # test errors msg = ('can only do set operations between two IntervalIndex objects ' 'that are closed on the same side') with tm.assert_raises_regex(ValueError, msg): set_op(Index([1, 2, 3])) for other_closed in {'right', 'left', 'both', 'neither'} - {closed}: other = self.create_index(closed=other_closed) with tm.assert_raises_regex(ValueError, msg): set_op(other) def test_isin(self, closed): index = self.create_index(closed=closed) expected = np.array([True] + [False] * (len(index) - 1)) result = index.isin(index[:1]) tm.assert_numpy_array_equal(result, expected) result = index.isin([index[0]]) 
tm.assert_numpy_array_equal(result, expected) other = IntervalIndex.from_breaks(np.arange(-2, 10), closed=closed) expected = np.array([True] * (len(index) - 1) + [False]) result = index.isin(other) tm.assert_numpy_array_equal(result, expected) result = index.isin(other.tolist()) tm.assert_numpy_array_equal(result, expected) for other_closed in {'right', 'left', 'both', 'neither'}: other = self.create_index(closed=other_closed) expected = np.repeat(closed == other_closed, len(index)) result = index.isin(other) tm.assert_numpy_array_equal(result, expected) result = index.isin(other.tolist()) tm.assert_numpy_array_equal(result, expected) def test_comparison(self): actual = Interval(0, 1) < self.index expected = np.array([False, True]) tm.assert_numpy_array_equal(actual, expected) actual = Interval(0.5, 1.5) < self.index expected = np.array([False, True]) tm.assert_numpy_array_equal(actual, expected) actual = self.index > Interval(0.5, 1.5) tm.assert_numpy_array_equal(actual, expected) actual = self.index == self.index expected = np.array([True, True]) tm.assert_numpy_array_equal(actual, expected) actual = self.index <= self.index tm.assert_numpy_array_equal(actual, expected) actual = self.index >= self.index tm.assert_numpy_array_equal(actual, expected) actual = self.index < self.index expected = np.array([False, False]) tm.assert_numpy_array_equal(actual, expected) actual = self.index > self.index tm.assert_numpy_array_equal(actual, expected) actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left') tm.assert_numpy_array_equal(actual, expected) actual = self.index == self.index.values tm.assert_numpy_array_equal(actual, np.array([True, True])) actual = self.index.values == self.index tm.assert_numpy_array_equal(actual, np.array([True, True])) actual = self.index <= self.index.values tm.assert_numpy_array_equal(actual, np.array([True, True])) actual = self.index != self.index.values tm.assert_numpy_array_equal(actual, np.array([False, False])) actual = self.index > self.index.values tm.assert_numpy_array_equal(actual, np.array([False, False])) actual = self.index.values > self.index tm.assert_numpy_array_equal(actual, np.array([False, False])) # invalid comparisons actual = self.index == 0 tm.assert_numpy_array_equal(actual, np.array([False, False])) actual = self.index == self.index.left tm.assert_numpy_array_equal(actual, np.array([False, False])) with tm.assert_raises_regex(TypeError, 'unorderable types'): self.index > 0 with tm.assert_raises_regex(TypeError, 'unorderable types'): self.index <= 0 with pytest.raises(TypeError): self.index > np.arange(2) with pytest.raises(ValueError): self.index > np.arange(3) def test_missing_values(self, closed): idx = Index([ np.nan, Interval(0, 1, closed=closed), Interval(1, 2, closed=closed) ]) idx2 = IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2], closed=closed) assert idx.equals(idx2) with pytest.raises(ValueError): IntervalIndex.from_arrays([np.nan, 0, 1], np.array([0, 1, 2]), closed=closed) tm.assert_numpy_array_equal(isna(idx), np.array([True, False, False])) def test_sort_values(self, closed): index = self.create_index(closed=closed) result = index.sort_values() tm.assert_index_equal(result, index) result = index.sort_values(ascending=False) tm.assert_index_equal(result, index[::-1]) # with nan index = IntervalIndex([Interval(1, 2), np.nan, Interval(0, 1)]) result = index.sort_values() expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan]) tm.assert_index_equal(result, expected) result = 
index.sort_values(ascending=False) expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) tm.assert_index_equal(result, expected) @pytest.mark.parametrize('tz', [None, 'US/Eastern']) def test_datetime(self, tz): start = Timestamp('2000-01-01', tz=tz) dates = date_range(start=start, periods=10) index = IntervalIndex.from_breaks(dates) # test mid start = Timestamp('2000-01-01T12:00', tz=tz) expected = date_range(start=start, periods=9) tm.assert_index_equal(index.mid, expected) # __contains__ doesn't check individual points assert Timestamp('2000-01-01', tz=tz) not in index assert Timestamp('2000-01-01T12', tz=tz) not in index assert Timestamp('2000-01-02', tz=tz) not in index iv_true = Interval(Timestamp('2000-01-01T08', tz=tz), Timestamp('2000-01-01T18', tz=tz)) iv_false = Interval(Timestamp('1999-12-31', tz=tz), Timestamp('2000-01-01', tz=tz)) assert iv_true in index assert iv_false not in index # .contains does check individual points assert not index.contains(Timestamp('2000-01-01', tz=tz)) assert index.contains(Timestamp('2000-01-01T12', tz=tz)) assert index.contains(Timestamp('2000-01-02', tz=tz)) assert index.contains(iv_true) assert not index.contains(iv_false) # test get_indexer start = Timestamp('1999-12-31T12:00', tz=tz) target = date_range(start=start, periods=7, freq='12H') actual = index.get_indexer(target) expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype='intp') tm.assert_numpy_array_equal(actual, expected) start = Timestamp('2000-01-08T18:00', tz=tz) target = date_range(start=start, periods=7, freq='6H') actual = index.get_indexer(target) expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) def test_append(self, closed): index1 = IntervalIndex.from_arrays([0, 1], [1, 2], closed=closed) index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed) result = index1.append(index2) expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3], closed=closed) tm.assert_index_equal(result, expected) result = index1.append([index1, index2]) expected = IntervalIndex.from_arrays([0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed) tm.assert_index_equal(result, expected) msg = ('can only append two IntervalIndex objects that are closed ' 'on the same side') for other_closed in {'left', 'right', 'both', 'neither'} - {closed}: index_other_closed = IntervalIndex.from_arrays([0, 1], [1, 2], closed=other_closed) with tm.assert_raises_regex(ValueError, msg): index1.append(index_other_closed) def test_is_non_overlapping_monotonic(self, closed): # Should be True in all cases tpls = [(0, 1), (2, 3), (4, 5), (6, 7)] idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is True idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is True # Should be False in all cases (overlapping) tpls = [(0, 2), (1, 3), (4, 5), (6, 7)] idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is False idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is False # Should be False in all cases (non-monotonic) tpls = [(0, 1), (2, 3), (6, 7), (4, 5)] idx = IntervalIndex.from_tuples(tpls, closed=closed) assert idx.is_non_overlapping_monotonic is False idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed) assert idx.is_non_overlapping_monotonic is False # Should be False for closed='both', otherwise True (GH16560) if closed == 'both': idx = IntervalIndex.from_breaks(range(4), 
closed=closed) assert idx.is_non_overlapping_monotonic is False else: idx = IntervalIndex.from_breaks(range(4), closed=closed) assert idx.is_non_overlapping_monotonic is True @pytest.mark.parametrize('tuples', [ lzip(range(10), range(1, 11)), lzip(date_range('20170101', periods=10), date_range('20170101', periods=10)), lzip(timedelta_range('0 days', periods=10), timedelta_range('1 day', periods=10)) ]) def test_to_tuples(self, tuples): # GH 18756 idx = IntervalIndex.from_tuples(tuples) result = idx.to_tuples() expected = Index(_asarray_tuplesafe(tuples)) tm.assert_index_equal(result, expected) @pytest.mark.parametrize('tuples', [ lzip(range(10), range(1, 11)) + [np.nan], lzip(date_range('20170101', periods=10), date_range('20170101', periods=10)) + [np.nan], lzip(timedelta_range('0 days', periods=10), timedelta_range('1 day', periods=10)) + [np.nan] ]) @pytest.mark.parametrize('na_tuple', [True, False]) def test_to_tuples_na(self, tuples, na_tuple): # GH 18756 idx = IntervalIndex.from_tuples(tuples) result = idx.to_tuples(na_tuple=na_tuple) # check the non-NA portion expected_notna = Index(_asarray_tuplesafe(tuples[:-1])) result_notna = result[:-1] tm.assert_index_equal(result_notna, expected_notna) # check the NA portion result_na = result[-1] if na_tuple: assert isinstance(result_na, tuple) assert len(result_na) == 2 assert all(isna(x) for x in result_na) else: assert isna(result_na)
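
# Standalone sketch, not part of the pandas test suite above: it mirrors what
# test_constructors checks -- from_breaks, from_arrays and from_tuples should all
# build the same IntervalIndex -- and shows how the `closed` side changes lookups.
# The break values and the demo function name are illustrative assumptions only.
import pandas as pd

def demo_intervalindex_constructors():
    breaks = [0, 1, 2, 3]
    left, right = breaks[:-1], breaks[1:]

    idx_right = pd.IntervalIndex.from_breaks(breaks, closed='right')
    idx_arrays = pd.IntervalIndex.from_arrays(left, right, closed='right')
    idx_tuples = pd.IntervalIndex.from_tuples(list(zip(left, right)), closed='right')

    # the three constructors agree
    assert idx_right.equals(idx_arrays)
    assert idx_right.equals(idx_tuples)

    # with closed='right', 1 falls in (0, 1]; with closed='left' it falls in [1, 2)
    assert list(idx_right.get_indexer([0.5, 1, 2.5])) == [0, 0, 2]
    idx_left = pd.IntervalIndex.from_breaks(breaks, closed='left')
    assert list(idx_left.get_indexer([0.5, 1, 2.5])) == [0, 1, 2]

    # points not covered by any interval map to -1
    assert list(idx_right.get_indexer([10])) == [-1]
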
def prepare_timetable(line_car_timetable_path, station_code_path, time_table_path, out_put_name, out_put_path):
    """Combine the per-line timetable workbooks into one cleaned timetable CSV.

    line_car_timetable_path: Excel file listing each line, its line code and
        default car count, with the timetable workbook name in the third column.
    station_code_path: Excel file mapping (STATION, LINE) to a numeric CODE.
    time_table_path: folder containing the per-line timetable workbooks.
    The result is written to out_put_path + out_put_name + '.csv' and returned.
    """
    line_car_timetable = pd.read_excel(line_car_timetable_path).dropna()
    station_code = pd.read_excel(station_code_path).dropna()
    out_put_name = out_put_path + out_put_name + '.csv'

    # Combine the timetable files for the different lines
    count = 0
    for ix, lt in line_car_timetable.iterrows():
        file = time_table_path + '/' + lt[2]  # third column holds the workbook name
        if count == 0:
            time_table = pd.read_excel(file, dtype={'Train_No': 'str', 'Trs_No': 'str', 'Trip_No': 'str'})
            # strip blanks from the column names, e.g. 'Train_No ' -> 'Train_No'
            time_table = time_table.rename(columns={name: name.replace(" ", "") for name in time_table.columns})
            time_table[['Train_No', 'Trs_No', 'Trip_No']] = \
                time_table[['Train_No', 'Trs_No', 'Trip_No']].astype('str')
            count += 1
        else:
            new_table = pd.read_excel(file, dtype={'Train_No': 'str', 'Trs_No': 'str', 'Trip_No': 'str',
                                                   'Arr_From': 'str', 'Dep_From': 'str',
                                                   'Arr_To': 'str', 'Dep_To': 'str'})
            new_table = new_table.rename(columns={name: name.replace(" ", "") for name in new_table.columns})
            new_table[['Train_No', 'Trs_No', 'Trip_No']] = \
                new_table[['Train_No', 'Trs_No', 'Trip_No']].astype('str')
            try:
                time_table = pd.concat([time_table, new_table], sort=False)
            except Exception:
                # keep going, but show which rows could not be appended
                print(new_table.loc[:, ['Train_No', 'Trs_No', 'Trip_No']])

    # Normalise the four time-of-day columns: keep only the time part, wrap
    # services that run past midnight back onto a 24-hour clock, then format
    # the timedeltas back to strings.
    time_list = ['Arr_From', 'Dep_From', 'Arr_To', 'Dep_To']
    for time_name in time_list:
        time_table[time_name] = time_table[time_name].astype('str')
        time_table[time_name] = time_table[time_name].apply(lambda x: x.split(' ')[-1])
        time_table[time_name] = pd.to_timedelta(time_table[time_name])
        time_table.loc[time_table[time_name] > pd.Timedelta('1 days'), time_name] -= pd.Timedelta('1 days')
        time_table[time_name] = time_table[time_name].apply(format_timedelta)

    time_table.drop(['Train_Trip', 'Train_KM'], axis=1, inplace=True)

    # attach line-level information (line code, default car count)
    df_merged = time_table.merge(line_car_timetable, left_on='Line', right_on='LINE', how='left')
    df_merged['Car_Num'].fillna(df_merged['DEFALT_CARS'], inplace=True)

    # station code for the From station
    df_merged = df_merged.merge(station_code, left_on=['From', 'Line'], right_on=['STATION', 'LINE'], how='left')
    df_merged['From_ID'] = df_merged['CODE']
    df_merged.drop(['CODE'], axis=1, inplace=True)

    # station code for the To station
    df_merged = df_merged.merge(station_code, left_on=['To', 'Line'], right_on=['STATION', 'LINE'], how='left')
    df_merged['To_ID'] = df_merged['CODE']

    # direction code (UP = 1, DOWN = 2)
    df_merged['Direction_ID'] = df_merged['Direction'].apply(lambda x: 1 if x == 'UP' else 2)

    # prepare the final output
    output = df_merged.loc[:, ['Line', 'LINE_CODE', 'Train_No', 'Trs_No', 'Trip_No', 'Revenue_Y_N',
                               'Direction', 'Direction_ID', 'From', 'From_ID', 'Arr_From', 'Dep_From',
                               'To', 'To_ID', 'Arr_To', 'Dep_To', 'Car_Num']]
    # keep revenue trips only
    output = output[output.Revenue_Y_N == 'Y']
    # prefix Trip_No so it is always treated as text downstream
    output['Trip_No'] = 'T_' + output['Trip_No']
    output = output.drop_duplicates()
    output.to_csv(out_put_name, index=False)
    return output
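
# prepare_timetable above calls a format_timedelta helper that is not defined in
# this snippet. The sketch below is an assumption of what such a helper might
# look like (zero-padded HH:MM:SS), together with a tiny reproduction of the
# "wrap past midnight" normalisation applied to Arr_From/Dep_From/Arr_To/Dep_To.
import pandas as pd

def format_timedelta(td):
    # hypothetical stand-in: render a Timedelta as a zero-padded HH:MM:SS string
    total_seconds = int(td.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)

def demo_wrap_past_midnight():
    # the last service runs past 24:00, a common convention in railway timetables
    times = pd.Series(['05:30:00', '23:59:30', '25:10:00'])
    td = pd.to_timedelta(times)

    # wrap anything beyond one day back onto a 24-hour clock, as prepare_timetable does
    td.loc[td > pd.Timedelta('1 days')] -= pd.Timedelta('1 days')

    assert td.apply(format_timedelta).tolist() == ['05:30:00', '23:59:30', '01:10:00']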