def test_timestamp_with_nulls(self): df = pd.DataFrame({'test': [pd.datetime(2016, 1, 1), None, pd.datetime(2016, 1, 3)]}) df['with_tz'] = df.test.dt.tz_localize('utc') self._check_pandas_roundtrip(df, null_counts=[1, 1])
def analyse_age_distribution(self, unique, counts): urllib.request.urlretrieve( 'https://www.dropbox.com/s/ze3chu5mvetjwv2/TagsControl2016.xlsx?dl=1', 'TagsControl2016.xlsx') age_data = pd.read_excel('TagsControl2016.xlsx') age_data.drop('Unnamed: 0', axis=1, inplace=True) age_data.Date = pd.to_datetime(age_data.Date) parity_indices = age_data.index[(age_data.Date >= pd.datetime(2016, 7, 25)) & (age_data.Date != pd.datetime(2016, 7, 26))] age_data.loc[parity_indices, 'From'] += 2048 age_data.loc[parity_indices, 'To'] += 2048 age_data['Age'] = [dt.days for dt in (pd.datetime.now() - age_data.Date)] age_by_idx = {} for index, row in age_data.iterrows(): if row.From.is_integer() and row.To.is_integer(): for idx in range(int(row.From), int(row.To)): age_by_idx[idx] = row.Age ages = [age_by_idx[u] for u, c in zip(unique, counts) if u in age_by_idx.keys()] self.plot_age_distribution(ages)
def _attach_files(self, files_info):
    """Attaches info returned by instrument list_files routine to Instrument object.
    """
    if not files_info.empty:
        if len(files_info.index.unique()) != len(files_info):
            estr = 'WARNING! Duplicate datetimes in provided file '
            estr = '{:s}information.\nKeeping one of each '.format(estr)
            estr = '{:s}of the duplicates, dropping the rest.'.format(estr)
            print(estr)
            # Index.get_duplicates() was removed in pandas 0.25;
            # index[index.duplicated()].unique() reports the same values
            print(files_info.index[files_info.index.duplicated()].unique())
            idx = np.unique(files_info.index, return_index=True)
            # .ix was removed in pandas 1.0; use positional .iloc here
            files_info = files_info.iloc[idx[1]]
            # raise ValueError('List of files must have unique datetimes.')
        self.files = files_info.sort_index()
        date = files_info.index[0]
        self.start_date = pds.datetime(date.year, date.month, date.day)
        date = files_info.index[-1]
        self.stop_date = pds.datetime(date.year, date.month, date.day)
    else:
        self.start_date = None
        self.stop_date = None
    # convert to object type
    # necessary if Series is empty, enables == checks with strings
    self.files = files_info.astype(np.dtype('O'))
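# A minimal sketch (not part of the source) of the dedup pattern used above:
# keep the first row for each duplicated index value, in modern pandas idiom.
import pandas as pd

s = pd.Series(['a', 'b', 'c'],
              index=pd.to_datetime(['2009-01-01', '2009-01-01', '2009-01-02']))
deduped = s[~s.index.duplicated(keep='first')]
print(deduped)
# 2009-01-01    a
# 2009-01-02    c
# dtype: object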
def create_test_df_source(sim_params=None, bars='daily'):
    if bars == 'daily':
        freq = pd.datetools.BDay()
    elif bars == 'minute':
        freq = pd.datetools.Minute()
    else:
        # report the unrecognized value; `freq` is unbound on this branch
        raise ValueError('%s bars not understood.' % bars)
    if sim_params:
        index = sim_params.trading_days
    else:
        if trading.environment is None:
            trading.environment = trading.TradingEnvironment()
        start = pd.datetime(1990, 1, 3, 0, 0, 0, 0, pytz.utc)
        end = pd.datetime(1990, 1, 8, 0, 0, 0, 0, pytz.utc)
        days = trading.environment.days_in_range(start, end)
        if bars == 'daily':
            index = days
        if bars == 'minute':
            index = pd.DatetimeIndex([], freq=freq)
            for day in days:
                day_index = trading.environment.market_minutes_for_day(day)
                index = index.append(day_index)
    x = np.arange(1, len(index) + 1)
    df = pd.DataFrame(x, index=index, columns=[0])
    return DataFrameSource(df), df
def create_test_panel_ohlc_source(sim_params=None): start = sim_params.first_open \ if sim_params else pd.datetime(1990, 1, 3, 0, 0, 0, 0, pytz.utc) end = sim_params.last_close \ if sim_params else pd.datetime(1990, 1, 8, 0, 0, 0, 0, pytz.utc) if trading.environment is None: trading.environment = trading.TradingEnvironment() index = trading.environment.days_in_range(start, end) price = np.arange(0, len(index)) + 100 high = price * 1.05 low = price * 0.95 open_ = price + .1 * (price % 2 - .5) volume = np.ones(len(index)) * 1000 arbitrary = np.ones(len(index)) df = pd.DataFrame({'price': price, 'high': high, 'low': low, 'open': open_, 'volume': volume, 'arbitrary': arbitrary}, index=index) panel = pd.Panel.from_dict({0: df}) return DataPanelSource(panel), panel
def test_divide_df_single_column(self): x = pd.DataFrame(dict(a=[2.0, 7.0, -7.0, -7.00, 3.5]), pd.date_range(pd.datetime(2015, 1, 1), periods=5)) y = pd.DataFrame(dict(b=[2.0, 3.5, 2.0, -3.5, -3.5]), pd.date_range(pd.datetime(2015, 1, 1), periods=5)) ans = list(divide_df_single_column(x, y).iloc[:, 0]) self.assertEqual(ans, [1., 2., -3.5, 2., -1.]) x = pd.DataFrame(dict(a=[2.0, np.nan, -7.0, np.nan, 3.5]), pd.date_range(pd.datetime(2015, 1, 1), periods=5)) y = pd.DataFrame(dict(b=[2.0, 3.5, np.nan, np.nan, -3.5]), pd.date_range(pd.datetime(2015, 1, 2), periods=5)) ans = list(divide_df_single_column(x, y).iloc[:, 0]) self.assertTrue(np.isnan(ans[0])) self.assertTrue(np.isnan(ans[1])) self.assertTrue(np.isnan(ans[3])) self.assertEqual(ans[2], -2.0) ans = list(divide_df_single_column( x, y, ffill=(True, False)).iloc[:, 0]) self.assertEqual(ans[1], 1.0) ans = list(divide_df_single_column( x, y, ffill=(False, True)).iloc[:, 0]) self.assertEqual(ans[4], 1.0) ans = list(divide_df_single_column( x, y, ffill=(True, True)).iloc[:, 0]) self.assertEqual(list(ans)[1:], [1., -2., -2.0, 1., -1.])
def test_yahoo_bars_to_panel_source(self): env = TradingEnvironment() finder = AssetFinder(env.engine) stocks = ['AAPL', 'GE'] env.write_data(equities_identifiers=stocks) start = pd.datetime(1993, 1, 1, 0, 0, 0, 0, pytz.utc) end = pd.datetime(2002, 1, 1, 0, 0, 0, 0, pytz.utc) data = factory.load_bars_from_yahoo(stocks=stocks, indexes={}, start=start, end=end) check_fields = ['sid', 'open', 'high', 'low', 'close', 'volume', 'price'] copy_panel = data.copy() sids = finder.map_identifier_index_to_sids( data.items, data.major_axis[0] ) copy_panel.items = sids source = DataPanelSource(copy_panel) for event in source: for check_field in check_fields: self.assertIn(check_field, event) self.assertTrue(isinstance(event['volume'], (integer_types))) self.assertTrue(event['sid'] in sids)
def create_test_panel_source(sim_params=None, source_type=None): start = sim_params.first_open \ if sim_params else pd.datetime(1990, 1, 3, 0, 0, 0, 0, pytz.utc) end = sim_params.last_close \ if sim_params else pd.datetime(1990, 1, 8, 0, 0, 0, 0, pytz.utc) if trading.environment is None: trading.environment = trading.TradingEnvironment() index = trading.environment.days_in_range(start, end) price = np.arange(0, len(index)) volume = np.ones(len(index)) * 1000 arbitrary = np.ones(len(index)) df = pd.DataFrame({'price': price, 'volume': volume, 'arbitrary': arbitrary}, index=index) if source_type: source_types = np.full(len(index), source_type) df['type'] = source_types panel = pd.Panel.from_dict({0: df}) return DataPanelSource(panel), panel
def test_interval(self): year_interval = IntervalPeriod('a_year', '01/01/2013', '02/01/2014') self._generic_test(year_interval, 366) test_interval_hours = year_interval.build(four_years_of_hours) selected_list = four_years_of_hours[test_interval_hours].tolist() self.assertEqual(selected_list[0], pd.datetime(2013,1,1,0)) self.assertEqual(selected_list[-1], pd.datetime(2014,1,1,23))
def pull_date_dicts(y, m, d, step_size, n_loops):
    '''
    Save dictionaries of Pittsburgh inspection PDF text for a given date range

    INPUT: y, m, d = ints, date to start loop
           step_size = number of days to include in each sub_file
           n_loops = number of files to create
    OUTPUT: pdf_main = dict, PDF text from all dates. Intermediate dicts from each
            loop are pickled; pdf_main is also pickled.

    The date range is broken into chunks by step_size and n_loops, to ensure that
    data is incrementally saved (in case of connection failure or some other
    terminal error).
    '''
    start = pd.datetime(y, m, d)
    delta = pd.Timedelta(1, 'd')
    pdf_main = {}
    for i in range(n_loops):
        print('[%02d] START: %s' % (i, start.strftime("%Y%m%d")))
        pdfs = get_pdf_text(start, step_size)
        save_pdf_text(pdfs, '../data/pitt/pitt_%s.pkl' % start.strftime("%Y%m%d"))
        pdf_main = merge_two_dicts(pdf_main, pdfs)
        start += step_size * delta
    started = pd.datetime(y, m, d)
    ended = start - delta
    save_pdf_text(pdf_main, '../data/pitt/pitt_FULL_%s_to_%s.pkl'
                  % (started.strftime("%Y%m%d"), ended.strftime("%Y%m%d")))
    return pdf_main
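# For orientation, a quick sketch (toy values, not from the source) of how the
# chunking arithmetic above plays out: step_size=7 and n_loops=4 cover four
# consecutive 7-day windows, i.e. 2015-01-01 through 2015-01-28.
import pandas as pd

start = pd.Timestamp(2015, 1, 1)
delta = pd.Timedelta(1, 'd')
step_size, n_loops = 7, 4
for i in range(n_loops):
    window_end = start + (step_size - 1) * delta
    print('chunk %d: %s .. %s' % (i, start.date(), window_end.date()))
    start += step_size * delta
# chunk 0: 2015-01-01 .. 2015-01-07
# ...
# chunk 3: 2015-01-22 .. 2015-01-28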
def test_get_trades_from_positions(self): positions = pd.DataFrame([np.nan, 2, 3, np.nan, 2, 3, 3.1, 4, 3, 5, 7], pd.date_range(start=pd.datetime(2015, 1, 1), periods=11)) price = pd.DataFrame([100, 103, np.nan, 106, 110, 105, np.nan, 106, 120, np.nan, 142], pd.date_range(start=pd.datetime(2015, 1, 1), periods=11)) #trades=get_trades_from_positions(price, positions, delayfill, roundpositions, None, None, None, None) trades = get_trades_from_positions( price, positions, True, True, None, None, None, None) self.assertEqual(list(trades.trades), [ 2.0, 1.0, -1.0, 1.0, 1.0, -1.0, 2.0, 2.0]) self.assertEqual(list(trades.fill_price)[ :-1], [106.0, 106.0, 105.0, 106.0, 120.0, 142.0, 142.0]) trades = get_trades_from_positions( price, positions, False, True, None, None, None, None) self.assertEqual(list(trades.trades), [ 2.0, 1.0, -1.0, 1.0, 1.0, -1.0, 2.0, 2.0]) self.assertEqual(list(trades.fill_price), [ 103.0, 106.0, 110.0, 105.0, 106.0, 120.0, 142.0, 142.0]) trades = get_trades_from_positions( price, positions, True, False, None, None, None, None) self.assertEqual(list(trades.trades), [ 2.0, 1.0, -1.0, 1.0, 0.1, 0.9, -1.0, 2.0, 2.0]) self.assertEqual(list(trades.fill_price)[ :-1], [106.0, 106.0, 105.0, 106.0, 106.0, 120.0, 120.0, 142.0, 142.0])
def get_clean_violation_data():
    '''Main function for getting and cleaning violation data'''
    ## Get data
    violations = get_complaint_data()
    ## Clean and filter data
    violations = violations[['BoroID', 'Block', 'Lot', 'Class', 'ApprovedDate']]
    violations = violations[~(violations.isnull().any(axis=1))]
    violations = violations[violations.BoroID.isin(range(1, 6))]
    violations = violations[violations.Class.isin(['A', 'B', 'C'])]
    violations.ApprovedDate = pd.to_datetime(violations.ApprovedDate)
    start = pd.datetime(2010, 4, 1)
    end = pd.datetime(2015, 3, 31)
    allowed_date_range_violation_approval = pd.date_range(start, end, freq='D')
    violations = violations[violations['ApprovedDate'].isin(allowed_date_range_violation_approval)]
    # list() is needed under Python 3, where map() returns an iterator
    violations['BBL'] = list(map(make_BBL, violations['BoroID'],
                                 violations['Block'], violations['Lot']))
    violations = violations.drop(['BoroID', 'Block', 'Lot'], axis=1)
    ## Group by BBL and class to construct the final dataframe with index=BBL and the
    ## total number of violations, by class, from 04/01/2010 to 03/31/2015 as features.
    grouped_by_BBL = violations.groupby(['BBL', 'Class']).size().reset_index()
    grouped_by_BBL.columns = ['BBL', 'Class', 'Count']
    grouped_by_BBL = grouped_by_BBL.pivot('BBL', 'Class', 'Count')
    grouped_by_BBL = grouped_by_BBL.fillna(0)
    return grouped_by_BBL
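# The groupby-then-pivot step is the core of the function above; a minimal
# self-contained sketch of the same pattern on toy data (not the real violations):
import pandas as pd

toy = pd.DataFrame({'BBL': [1, 1, 2, 2, 2],
                    'Class': ['A', 'B', 'A', 'A', 'C']})
counts = toy.groupby(['BBL', 'Class']).size().reset_index(name='Count')
wide = counts.pivot(index='BBL', columns='Class', values='Count').fillna(0)
print(wide)
# Class    A    B    C
# BBL
# 1      1.0  1.0  0.0
# 2      2.0  0.0  1.0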
def test_pandl(self): fx = pd.DataFrame([2.0] * 10, dt_range1) price = pd.DataFrame( [100, 103, 105, 106, 110, 105, 104.5, np.nan, 120, np.nan, 142], dt_range2) trades = pd.concat([ pd.DataFrame( dict( trades=[2, 1, -1, np.nan, 1], fill_price=[102.9, 105.5, 106.5, np.nan, 106.]), pd.date_range(start=pd.datetime(2015, 1, 2), periods=5)), pd.DataFrame( dict(trades=[-1, 1, -1], fill_price=[107, 119, 132]), pd.date_range(start=pd.datetime(2015, 1, 8), periods=3)) ]) ans = pandl(price, trades, marktomarket=True, fx=fx) np.testing.assert_almost_equal(ans.pandl_base[1:], [ 0.0, 10.4, 6., 14., -16., -9., 15., 48., 78., 40. ]) ans2 = pandl(price, trades, marktomarket=False, fx=fx) np.testing.assert_almost_equal(ans2.pandl_base[1:], [10.4, 6., 0., -2., 6., 48., 78.])
def test_datetime_name_accessors(self, time_locale):
    # Test Monday -> Sunday and January -> December, in that sequence
    if time_locale is None:
        # If time_locale is None, day_name and month_name should
        # return the English attributes
        expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                         'Friday', 'Saturday', 'Sunday']
        expected_months = ['January', 'February', 'March', 'April', 'May',
                           'June', 'July', 'August', 'September', 'October',
                           'November', 'December']
    else:
        with tm.set_locale(time_locale, locale.LC_TIME):
            expected_days = calendar.day_name[:]
            expected_months = calendar.month_name[1:]

    # GH#11128
    dti = pd.date_range(freq='D', start=datetime(1998, 1, 1), periods=365)
    english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                    'Friday', 'Saturday', 'Sunday']
    for day, name, eng_name in zip(range(4, 11), expected_days, english_days):
        name = name.capitalize()
        assert dti.weekday_name[day] == eng_name
        assert dti.day_name(locale=time_locale)[day] == name
        ts = Timestamp(datetime(2016, 4, day))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            assert ts.weekday_name == eng_name
        assert ts.day_name(locale=time_locale) == name
    dti = dti.append(DatetimeIndex([pd.NaT]))
    assert np.isnan(dti.day_name(locale=time_locale)[-1])
    ts = Timestamp(pd.NaT)
    assert np.isnan(ts.day_name(locale=time_locale))

    # GH#12805
    dti = pd.date_range(freq='M', start='2012', end='2013')
    result = dti.month_name(locale=time_locale)
    expected = Index([month.capitalize() for month in expected_months])
    # work around different normalization schemes
    # https://github.com/pandas-dev/pandas/issues/22342
    if not compat.PY2:
        result = result.str.normalize("NFD")
        expected = expected.str.normalize("NFD")
    tm.assert_index_equal(result, expected)
    for date, expected in zip(dti, expected_months):
        result = date.month_name(locale=time_locale)
        expected = expected.capitalize()
        if not compat.PY2:
            result = unicodedata.normalize("NFD", result)
            # normalize the expected value, not `result` again,
            # otherwise the assertion below is vacuous
            expected = unicodedata.normalize("NFD", expected)
        assert result == expected
    dti = dti.append(DatetimeIndex([pd.NaT]))
    assert np.isnan(dti.month_name(locale=time_locale)[-1])
def readRange(fileh, opt, st, dt, tname, key=False):
    # key=False drops records before 9:30
    # merges and returns the data
    # the inputs are not validated for now
    start_date = pd.to_datetime(st)
    end_date = pd.to_datetime(dt)
    opt_name = 'OP' + str(opt)
    dtt = pd.date_range(start=start_date, end=end_date)
    # get partitions
    GG = []
    for ik in dtt:
        tmp = fetchPartition(fileh, opt_name, ik)
        if tmp:
            GG.append(tmp)
    # get each df, clean it, concat it
    DD = pd.DataFrame()
    for jk in GG:
        data = fetchTable(jk, tname)
        df = pd.DataFrame.from_records(data,
                                       index=data['timestamp'].astype('datetime64[ns]'),
                                       exclude=['timestamp'])
        # filter to trading hours
        y = df.index[1].year
        m = df.index[1].month
        d = df.index[1].day
        t_filter = ((df.index > pd.datetime(y, m, d, 9, 30))
                    & (df.index <= pd.datetime(y, m, d, 11, 30))) \
                   | (df.index > pd.datetime(y, m, d, 13, 0))
        DD = pd.concat([DD, df[t_filter]])
    return DD
def __init__(self):
    # format of links to the txt files
    self.url_base = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{0}.txt"
    # first day of data
    self.begining_of_time = datetime(2010, 5, 1)
    # date when format of data changed
    self.new_era = datetime(2014, 10, 18)
    self.today = datetime.today()
    # prepare station df for old format data
    self.data_dir = "static/data/"
    station_df_path = os.path.join(self.data_dir, "station.pkl")
    if os.path.isfile(station_df_path):
        # pickles must be read in binary mode under Python 3
        with open(station_df_path, "rb") as f:
            self.station_df = pickle.load(f)
    else:
        self.station_df = pd.read_excel(
            "http://web.mta.info/developers/resources/nyct/turnstile/Remote-Booth-Station.xls"
        )
        self.station_df.columns = [
            "UNIT", "C/A", "STATION", "LINENAME", "DIVISION"
        ]
        # save to data directory
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)
        with open(station_df_path, "wb") as f:
            pickle.dump(self.station_df, f)
def test_write_metrics1(): filename = abspath(join(testdir, 'test_write_metrics1.csv')) if isfile(filename): os.remove(filename) metrics = pd.DataFrame({'metric1' : pd.Series([1.], index=[pd.datetime(2016,1,1)])}) pecos.io.write_metrics(filename, metrics) assert_true(isfile(filename)) from_file1 = pd.read_csv(filename) assert_equals(from_file1.shape, (1,2)) # append another date metrics = pd.DataFrame({'metric1' : pd.Series([2.], index=[pd.datetime(2016,1,2)])}) pecos.io.write_metrics(filename, metrics) from_file2 = pd.read_csv(filename) assert_equals(from_file2.shape, (2,2)) # append another metric metrics = pd.DataFrame({'metric2' : pd.Series([3.], index=[pd.datetime(2016,1,2)])}) pecos.io.write_metrics(filename, metrics) from_file3= pd.read_csv(filename) assert_equals(from_file3.shape, (2,3))
def create_test_df_source(sim_params=None, bars='daily'):
    if bars == 'daily':
        freq = pd.datetools.BDay()
    elif bars == 'minute':
        freq = pd.datetools.Minute()
    else:
        # report the unrecognized value; `freq` is unbound on this branch
        raise ValueError('%s bars not understood.' % bars)
    if sim_params:
        index = sim_params.trading_days
    else:
        start = pd.datetime(1990, 1, 3, 0, 0, 0, 0, pytz.utc)
        end = pd.datetime(1990, 1, 8, 0, 0, 0, 0, pytz.utc)
        index = pd.DatetimeIndex(start=start, end=end, freq=freq)
    if bars == 'minute':
        new_index = []
        for i in index:
            market_open = i.replace(hour=14, minute=31)
            market_close = i.replace(hour=21, minute=0)
            if i >= market_open and i <= market_close:
                new_index.append(i)
        index = new_index
    x = np.arange(1, len(index) + 1)
    df = pd.DataFrame(x, index=index, columns=[0])
    return DataFrameSource(df), df
def main():
    stock_name_list = ['IBM', 'AAPL', 'C']
    start = pd.datetime(2011, 11, 21)
    end = pd.datetime(2012, 3, 21)
    stock_class_list = {stock: Stock(stock, start, end) for stock in stock_name_list}
    pricecols = {stock: stock_class.closeprice
                 for stock, stock_class in stock_class_list.items()}
    closed_price_df = pd.DataFrame(pricecols)
    print(closed_price_df.head())
def setUp(self): setup_logger(self) start = pd.datetime(1993, 1, 1, 0, 0, 0, 0, pytz.utc) end = pd.datetime(1994, 1, 1, 0, 0, 0, 0, pytz.utc) self.data = factory.load_from_yahoo(stocks=['AAPL'], indexes={}, start=start, end=end)
def test_single_orbit_call_orbit_starts_0_UT_using_next(self): self.testInst.load(2009,1) self.testInst.orbits.next() ans = (self.testInst.data.index[0] == pds.datetime(2009,1,1)) ans2 = (self.testInst.data.index[-1] == (pds.datetime(2009,1,1,1,36,59) )) # print (ans,ans2) # print (self.testInst.data.index[0], self.testInst.data.index[-1]) assert ans & ans2
def _get_default_series(self): """ What we return if currency rates match """ DEFAULT_DATES = pd.date_range(start=pd.datetime(1970, 1, 1), end=pd.datetime(2050, 1, 1)) DEFAULT_RATE_SERIES = pd.DataFrame(dict(fx=[1.0] * len(DEFAULT_DATES)), index=DEFAULT_DATES) return DEFAULT_RATE_SERIES
def test_single_orbit_call_by_1_index(self): self.testInst.load(2009,1) self.testInst.orbits[1] ans = (self.testInst.data.index[0] == pds.datetime(2009,1,1,1,37)) ans2 = (self.testInst.data.index[-1] == (pds.datetime(2009,1,1,3,13,59) )) # print (ans,ans2) # print (self.testInst.data.index[0], self.testInst.data.index[-1]) assert ans & ans2
def test_single_orbit_call_orbit_starts_off_0_UT_using_next(self): from dateutil.relativedelta import relativedelta as relativedelta self.testInst.load(2008,366) self.testInst.orbits.next() # print self.testInst.data.index[0], pds.datetime(2008,12,30, 23, 45), self.testInst.data.index[-1], (pds.datetime(2008,12,30, 23, 45)+relativedelta(hours=1, minutes=36, seconds=59) ) ans = (self.testInst.data.index[0] == pds.datetime(2008,12,30, 23, 45)) ans2 = (self.testInst.data.index[-1] == (pds.datetime(2008,12,30, 23, 45)+relativedelta(hours=1, minutes=36, seconds=59) )) assert ans & ans2
def create_test_df_source(): start = pd.datetime(1990, 1, 3, 0, 0, 0, 0, pytz.utc) end = pd.datetime(1990, 1, 8, 0, 0, 0, 0, pytz.utc) index = pd.DatetimeIndex(start=start, end=end, freq=pd.datetools.day) x = np.arange(2., len(index) * 2 + 2).reshape((-1, 2)) df = pd.DataFrame(x, index=index, columns=[0, 1]) return DataFrameSource(df), df
def getOutcomes(allData): outcomes = ['fully_funded','great_chat','is_exciting','at_least_1_teacher_referred_donor','at_least_1_green_donation','three_or_more_non_teacher_referred_donors','one_non_teacher_referred_donor_giving_100_plus','donation_from_thoughtful_donor'] data = allData[outcomes] train = data[(allData['date_posted']>=pd.datetime(2011,7,1)) & (allData['date_posted']<pd.datetime(2013,7,1))] cv = data[(allData['date_posted']>=pd.datetime(2013,7,1)) & (allData['date_posted']<pd.datetime(2013,10,1))] predict = data[(allData['date_posted']<pd.datetime(2014,1,1)) & (allData['date_posted']>=pd.datetime(2012,1,1))] test = data[(allData['date_posted']>=pd.datetime(2013,10,1)) & (allData['date_posted']<pd.datetime(2014,1,1))] return train, cv, predict, test
def friBeforeLastTues(yy, mm):
    """Day of month of the Friday before the last Tuesday of (yy, mm)."""
    first = p.datetime(yy, mm, 1)
    lastDay = p.Timestamp(first).days_in_month
    lastDate = p.datetime(yy, mm, lastDay)
    dow = lastDate.isoweekday()  # Mon=1 .. Sun=7
    # a month ending on a Monday needs to step back an extra week
    inc = 7 if dow < 2 else 0
    return lastDay - 2 - dow - inc
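# A quick cross-check of the arithmetic above (a sketch; fri_before_last_tues_check
# is a hypothetical helper, not part of the source): find the last Tuesday with a
# Week offset, then step back to the preceding Friday.
import pandas as pd

def fri_before_last_tues_check(yy, mm):
    last = pd.Timestamp(yy, mm, 1) + pd.offsets.MonthEnd(0)
    if last.weekday() != 1:  # weekday(): Mon=0 .. Sun=6, so 1 is Tuesday
        last = last + pd.offsets.Week(weekday=1, n=-1)  # roll back to the last Tuesday
    return (last - pd.Timedelta(days=4)).day  # the Friday before that Tuesday

# December 2015: the last Tuesday is the 29th, the preceding Friday the 25th,
# matching friBeforeLastTues(2015, 12) = 31 - 2 - 4 - 0 = 25
print(fri_before_last_tues_check(2015, 12))  # 25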
def test_constructor_coverage(self): rng = date_range('1/1/2000', periods=10.5) exp = date_range('1/1/2000', periods=10) tm.assert_index_equal(rng, exp) msg = 'periods must be a number, got foo' with pytest.raises(TypeError, match=msg): date_range(start='1/1/2000', periods='foo', freq='D') with pytest.raises(ValueError): with tm.assert_produces_warning(FutureWarning): DatetimeIndex(start='1/1/2000', end='1/10/2000') with pytest.raises(TypeError): DatetimeIndex('1/1/2000') # generator expression gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) result = DatetimeIndex(gen) expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i) for i in range(10)]) tm.assert_index_equal(result, expected) # NumPy string array strings = np.array(['2000-01-01', '2000-01-02', '2000-01-03']) result = DatetimeIndex(strings) expected = DatetimeIndex(strings.astype('O')) tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) tm.assert_index_equal(from_ints, expected) # string with NaT strings = np.array(['2000-01-01', '2000-01-02', 'NaT']) result = DatetimeIndex(strings) expected = DatetimeIndex(strings.astype('O')) tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) tm.assert_index_equal(from_ints, expected) # non-conforming msg = ("Inferred frequency None from passed values does not conform" " to passed frequency D") with pytest.raises(ValueError, match=msg): DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04'], freq='D') msg = ("Of the four parameters: start, end, periods, and freq, exactly" " three must be specified") with pytest.raises(ValueError, match=msg): date_range(start='2011-01-01', freq='b') with pytest.raises(ValueError, match=msg): date_range(end='2011-01-01', freq='B') with pytest.raises(ValueError, match=msg): date_range(periods=10, freq='D')
def _calculate_historical_statistics(sonde_file, parameter, averaging, recent_years=3):
    sonde_data = _read_sonde_data(sonde_file)
    sonde_param_data = sonde_data[parameter]
    sonde_param_data[sonde_param_data < -900] = np.nan
    sonde_param_data[np.logical_and(sonde_param_data.index.month == 2,
                                    sonde_param_data.index.day == 29)] = np.nan
    sonde_param_data.dropna(inplace=True)
    final_year = sonde_param_data.index.year[-1]
    first_year = sonde_param_data.first_valid_index().year
    year_str = '(%s - %s)' % (first_year, final_year - 1)
    historical_enddate = pd.datetime(final_year - 1, 12, 31, 23, 59)
    # .ix was removed in pandas 1.0; .loc does the same label-based slicing here
    historical_data = sonde_param_data.loc[:historical_enddate]
    if averaging == 'monthly':
        grouped_monthly_data = historical_data.groupby(lambda d: d.month)
        hist_stat = pd.DataFrame({
            'min': grouped_monthly_data.min(),
            'mean': grouped_monthly_data.mean(),
            'max': grouped_monthly_data.max()})
        for year_ago in np.arange(recent_years):
            start_date = pd.datetime(final_year - year_ago, 1, 1)
            end_date = pd.datetime(final_year - year_ago, 12, 31, 23, 59)
            monthly_mean = _calculate_mean(sonde_param_data.loc[start_date:end_date], 'M')
            hist_stat[str(start_date.year)] = pd.DataFrame(monthly_mean.values,
                                                           index=monthly_mean.index.month)
    else:
        grouped_daily_data = historical_data.groupby(lambda d: (d.month, d.day))
        hist_stat = pd.DataFrame({
            'min': grouped_daily_data.min(),
            'mean': grouped_daily_data.mean(),
            'max': grouped_daily_data.max()})
        try:
            # requiring historical data to have a minimum of one record for each day of year
            hist_stat.index = np.arange(1, 366)
        except ValueError:
            raise ValueError("The merged data file %s doesn't have the minimum "
                             "required record length of five years." % sonde_file)
        for year_ago in np.arange(recent_years):
            start_date = pd.datetime(final_year - year_ago, 1, 1)
            end_date = pd.datetime(final_year - year_ago, 12, 31, 23, 59)
            daily_mean = _calculate_mean(sonde_param_data.loc[start_date:end_date], 'D')
            daily_mean.index = daily_mean.index.dayofyear
            hist_stat[str(start_date.year)] = pd.DataFrame(daily_mean.values,
                                                           index=daily_mean.index)
    hist_stat.year_range = year_str
    hist_stat.final_year = final_year
    hist_stat.first_year = first_year
    return hist_stat
def test_construction_outofbounds(self): # GH 13663 dates = [datetime(3000, 1, 1), datetime(4000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)] exp = Index(dates, dtype=object) # coerces to object tm.assert_index_equal(Index(dates), exp) with pytest.raises(OutOfBoundsDatetime): # can't create DatetimeIndex DatetimeIndex(dates)
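# For context, a small sketch of the datetime64[ns] bounds that put those dates
# out of range (the printed limits are pandas' documented Timestamp bounds):
import pandas as pd

print(pd.Timestamp.min)  # 1677-09-21 00:12:43.145224193
print(pd.Timestamp.max)  # 2262-04-11 23:47:16.854775807
# out-of-range strings can be coerced to NaT instead of raising
print(pd.to_datetime(['3000-01-01'], errors='coerce'))  # DatetimeIndex(['NaT'], ...)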
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objs as go
import pandas as pd

from app import app

terrorism = pd.read_csv('apps/data/terrorism.csv',
                        encoding='latin-1',
                        low_memory=False,
                        usecols=['iyear', 'imonth', 'iday', 'country_txt', 'city',
                                 'longitude', 'latitude', 'nkill', 'nwound',
                                 'summary', 'target1', 'gname'])

terrorism = terrorism[terrorism['imonth'] != 0]
terrorism['day_clean'] = [15 if x == 0 else x for x in terrorism['iday']]
terrorism['date'] = [pd.datetime(y, m, d)
                     for y, m, d in zip(terrorism['iyear'], terrorism['imonth'],
                                        terrorism['day_clean'])]

layout = html.Div([
    html.Br(),
    html.H3('Global Terrorism Database: 1970 - 2016'),
    html.A('Explore Cities', href='/country'),
    dcc.Graph(id='map_world', config={'displayModeBar': False}),
    html.Div([
        dcc.RangeSlider(
            id='years',
            min=1970,
            max=2016,
''' Created on 3 Dec 2015 @author: rob ''' import unittest import pandas as pd import numpy as np from pysystemtrade.syscore.accounting import pandl, get_positions_from_forecasts, get_trades_from_positions dt_range1 = pd.date_range(start=pd.datetime(2014, 12, 30), periods=10) dt_range2 = pd.date_range(start=pd.datetime(2015, 1, 1), periods=11) class Test(unittest.TestCase): def test_get_positions_from_forecasts(self): fx = pd.DataFrame([2.0] * 10, dt_range1) price = pd.DataFrame( [100, 103, 105, 106, 110, 105, np.nan, 106, 120, np.nan, 142], dt_range2) forecast = pd.DataFrame([ np.nan, np.nan, np.nan, np.nan, 10.0, 10.0, 15.0, 15.0, 5.0, 0.0, -5.0 ], dt_range2) value_of_price_point = 150.0 daily_return_volatility = None position = get_positions_from_forecasts(price, daily_return_volatility,
""" First some constants """ CALENDAR_DAYS_IN_YEAR = 365.25 BUSINESS_DAYS_IN_YEAR = 256.0 ROOT_BDAYS_INYEAR = BUSINESS_DAYS_IN_YEAR**.5 WEEKS_IN_YEAR = CALENDAR_DAYS_IN_YEAR / 7.0 ROOT_WEEKS_IN_YEAR = WEEKS_IN_YEAR**.5 MONTHS_IN_YEAR = 12.0 ROOT_MONTHS_IN_YEAR = MONTHS_IN_YEAR**.5 ARBITRARY_START = pd.datetime(1900, 1, 1) HOURS_PER_DAY = 24 MINUTES_PER_HOUR = 60 SECONDS_PER_HOUR = 60 SECONDS_IN_YEAR = CALENDAR_DAYS_IN_YEAR * HOURS_PER_DAY * MINUTES_PER_HOUR * SECONDS_PER_HOUR UNIXTIME_CONVERTER = 1e9 UNIXTIME_IN_YEAR = UNIXTIME_CONVERTER * SECONDS_IN_YEAR MONTH_LIST = ["F", "G", "H", "J", "K", "M", "N", "Q", "U", "V", "X", "Z"] def month_from_contract_letter(contract_letter): """
def process_data(base_path): import pandas as pd # processed_dataset = {} # validation == 1000 samples # train === 5000 samples # test === 1000 samples # convert to number of actions per week # edit out the badge outcome variables print("Processing raw data") output_fname = os.path.join(base_path, 'so_data.pkl') labels = ['train', 'valid', 'test'] input_fname = os.path.join(csv_path, 'so_badges.csv') data = pd.read_csv(input_fname) data.Date = pd.to_datetime(data.Date) data['week'] = (data.Date - pd.datetime(year=2017, month=1, day=1)).dt.days data = data.groupby(['DummyUserId', 'week']).agg('sum').reset_index() badge_ixs = data[data.Electorate > 0] max_week = data.week.max() badge_ixs = badge_ixs[badge_ixs.week > 45] badge_ixs = badge_ixs[badge_ixs.week < max_week - 46] badge_ixs = badge_ixs.DummyUserId print(len(badge_ixs.unique())) indexes = badge_ixs.unique() train = np.random.choice(indexes, size=4000, replace=False) indexes = indexes[~np.in1d(indexes, train)] validate = np.random.choice(indexes, size=1000, replace=False) indexes = indexes[~np.in1d(indexes, validate)] test = np.random.choice(indexes, size=1000, replace=False) # data.set_index('DummyUserId', inplace=True) processed_dataset = {} for s, dset in enumerate([train, validate, test]): split = labels[s] processed_dataset[split] = {} sub_data = data[data.DummyUserId.isin(dset)] n_seqs = len(dset) processed_dataset[split]['sequence_lengths'] = torch.zeros( n_seqs, dtype=torch.long) processed_dataset[split]['sequences'] = [] processed_dataset[split]['outcomes'] = [] idx = 0 for u_id, seqs in sub_data.groupby('DummyUserId'): seqs = seqs.sort_values('week') out = {} for b in BADGES: idxs = np.where(seqs[b] == 1)[0] if len(idxs) > 0: out[b] = torch.tensor(idxs, dtype=torch.long) civic_duty = out['Electorate'] days = 90 action_vec = seqs[ACTIONS].values[civic_duty - days // 2:civic_duty + days // 2, :] out['Electorate'] = torch.tensor([days // 2], dtype=torch.long) processed_dataset[split]['sequence_lengths'][idx] = days processed_sequence = torch.tensor(action_vec, dtype=torch.long) processed_dataset[split]['sequences'].append(processed_sequence) processed_dataset[split]['outcomes'].append(out) idx += 1 pickle.dump(processed_dataset, open(output_fname, "wb"), pickle.HIGHEST_PROTOCOL) print("dumped processed data to %s" % output_fname)
def main(fobs, fcable, case_name, ring, term): # _________________________ CABLE ___________________________ cable = nc.Dataset(fcable, 'r') Time = nc.num2date(cable.variables['time'][:], cable.variables['time'].units) Rainf = pd.DataFrame(cable.variables['Rainf'][:, 0, 0], columns=['Rainf']) Rainf = Rainf * 1800. Rainf['dates'] = Time Rainf = Rainf.set_index('dates') Rainf = Rainf.resample("D").agg('sum') Rainf.index = Rainf.index - pd.datetime(2011, 12, 31) Rainf.index = Rainf.index.days var = pd.DataFrame(cable.variables[term][:, 0, 0], columns=['var']) #var = pd.DataFrame(cable.variables['Rnet'][:,0,0]-cable.variables['Qg'][:,0,0],columns=['var']) var['dates'] = Time var = var.set_index('dates') var = var.resample("D").agg('mean') var.index = var.index - pd.datetime(2011, 12, 31) var.index = var.index.days Tair = pd.DataFrame(cable.variables['Tair'][:, 0, 0] - 273.15, columns=['Tair']) Tair['dates'] = Time Tair = Tair.set_index('dates') Tair = Tair.resample("D").agg('max') Tair.index = Tair.index - pd.datetime(2011, 12, 31) Tair.index = Tair.index.days # exclude rainday and the after two days of rain day = np.zeros((len(var)), dtype=bool) for i in np.arange(0, len(var)): if (Tair.values[i] >= 35. and Rainf.values[i] == 0.): day[i] = True event = 0 con_max = 0 i = 0 while i < len(var) - 2: if np.all([day[i:i + 3]]): event += 1 i += 3 con = 3 while day[i]: con += 1 i += 1 else: con = 0 i += 1 if con > con_max: con_max = con print(event) print(con_max) v = np.zeros((event, con_max)) lct = np.zeros((event, con_max)) v[:, :] = np.nan for con in np.arange(1, con_max + 1): lct[:, con - 1] = con i = 0 j = 0 while i < len(var) - 2: if (np.all([day[i:i + 3]])): print(Tair.index[i]) print(var['var'].values[i]) v[j, 0] = var['var'].values[i] v[j, 1] = var['var'].values[i + 1] v[j, 2] = var['var'].values[i + 2] i = i + 3 cont_day = 3 while day[i]: v[j, cont_day] = var['var'].values[i] i += 1 cont_day += 1 j += 1 else: i += 1 print(v) print(lct) #return np.ravel(v),np.ravel(lct); return v, lct
"ZEEL":"Media & Entertainment", "HINDALCO":"Metals & Mining", "VEDL":"Metals & Mining", "JSWSTEEL":"Metals & Mining", "TATASTEEL":"Metals & Mining", "COALINDIA":"Metals & Mining", "CIPLA":"Pharma", "DRREDDY":"Pharma", "SUNPHARMA":"Pharma", "ADANIPORTS":"Shipping","MUNDRAPORT":"Shipping", "BHARTIARTL":"Telecom" } df["SECTORS"] = df["Symbol"].map(sectors) df # In[4]: df_2017 = df[(df.index >= pd.datetime(2017, 1, 1)) & (df.index <= pd.datetime(2017, 12, 31))] cmp_17 = df_2017.groupby(["Symbol"]) companies_17 = cmp_17.resample('MS').mean() companies_17 # In[5]: df_2018 = df[(df.index >= pd.datetime(2018, 1, 1)) & (df.index <= pd.datetime(2018, 12, 31))] cmp_18 = df_2018.groupby(["Symbol"]) companies_18 = cmp_18.resample('MS').mean() companies_18
def transform_editing_data_to_file_folder_structure(path_to_csv_actions, path_to_csv_badges, path_to_data_dir):
    '''
    Expecting data in the PIVOTED format from the Stack Overflow query editor.
    Here the csv file has an index of userIds, and the columns are the dates from
    start to end. The values are the counts of edits that that user performed on
    that day. There is a separate file for the userId.
    '''
    import tqdm
    data_actions = pd.read_csv(path_to_csv_actions)
    badge_achievements = pd.read_csv(path_to_csv_badges)
    data_actions = data_actions[data_actions.UserId.isin(badge_achievements.UserId)]
    badge_achievements = badge_achievements[badge_achievements.UserId.isin(data_actions.UserId)]
    start_date = pd.datetime(year=2009, month=1, day=1)
    badge_achievements.Date = pd.to_datetime(badge_achievements.Date)
    badge_achievements['day'] = (badge_achievements.Date - start_date).dt.days
    user_ids = badge_achievements.UserId.unique()
    size_data = len(user_ids)
    np.random.seed(11)
    train = np.random.choice(user_ids, size=int(np.floor(0.6 * size_data)), replace=False)
    user_ids = user_ids[~np.in1d(user_ids, train)]
    validate = np.random.choice(user_ids, size=int(np.floor(0.2 * size_data)), replace=False)
    user_ids = user_ids[~np.in1d(user_ids, validate)]
    test = np.random.choice(user_ids, size=int(np.floor(0.2 * size_data)), replace=False)
    data_actions.set_index('UserId', inplace=True)
    badge_achievements.set_index('UserId', inplace=True)
    num_days = (badge_achievements.Date.max() - start_date).days
    for dset in [train, validate, test]:
        for user in tqdm.tqdm(dset):
            trajectory = data_actions.loc[user]
            trajectory = trajectory.reset_index()
            trajectory['index'] = pd.to_datetime(trajectory['index'])
            trajectory['day'] = (trajectory['index'] - start_date).dt.days
            # the per-user column holds the edit counts; rename it to
            # 'num_actions', which the code below relies on
            trajectory.rename(columns={'index': 'date', user: 'num_actions'},
                              inplace=True)
            trajectory.sort_values('day', inplace=True)
            trajectory.set_index('day', inplace=True)
            trajectory = trajectory.reindex(range(num_days + 1), fill_value=0)
            action_trajectory = torch.tensor(trajectory[['num_actions']].values,
                                             dtype=torch.long)
            torch.save(action_trajectory, '{}/user_{}.pt'.format(path_to_data_dir, user))
    with open('{}/badge_achievements.json'.format(path_to_data_dir), 'w') as f:
        badge_dict = badge_achievements['day'].to_dict()
        badge_dict = {k: {'strunk_white': [int(v)]} for k, v in badge_dict.items()}
        json.dump(badge_dict, f)
    with open('{}/data_indexes.json'.format(path_to_data_dir), 'w') as f:
        obj = {}
        obj['train'] = [int(u) for u in train]
        obj['test'] = [int(u) for u in test]
        obj['validate'] = [int(u) for u in validate]
        json.dump(obj, f)
def read_data(vids, data_path, t_division): this_data_file_name = t_division + 'data.hdf5' this_division_file_name = t_division + 'division.hdf5' data_path_file_names = os.listdir('../') if this_data_file_name in data_path_file_names and this_division_file_name in data_path_file_names: print('reading previously saved data') sum_hourly_viewers = pd.read_hdf('../' + this_division_file_name) in_file = h5py.File('../' + this_data_file_name, 'r') account_data = np.copy(in_file['account_data']) org_data = np.copy(in_file['org_data']) vids_out = np.copy(in_file['vids_out']) in_file.close() else: account_data = [] org_data = [] vids_out = [] for vid_ind, vid in enumerate(vids): file_name = data_path + vid + '.06-17.08-02.hdf5' df = pd.read_hdf(file_name) if len(df) == 0: print(file_name, end='') else: df = df.append( pd.DataFrame( { 'first_start_time': pd.datetime(2016, 6, 16), 'account_id': '', 'org_id': '' }, [-1])) df = df.append( pd.DataFrame( { 'first_start_time': pd.datetime(2016, 8, 3), 'account_id': '', 'org_id': '' }, [len(df.account_id)])) df_reind = df.copy() df_reind = df_reind.set_index(['first_start_time']) year_month_day_hour = pd.to_datetime( 2016 * 1000000 + df_reind.index.month * 10000 + df_reind.index.day * 100 + df_reind.index.hour, format='%Y%m%d%H') df_reind['year_month_day_hour'] = year_month_day_hour num_current_viewers = df_reind.groupby( 'year_month_day_hour').account_id.nunique() num_current_orgs = df_reind.groupby( 'year_month_day_hour').org_id.nunique() sum_hourly_viewers = num_current_viewers.resample( t_division).sum() sum_hourly_viewers = sum_hourly_viewers.fillna(0) sum_hourly_viewers = sum_hourly_viewers[ '2016-06-18 00:00:00':'2016-07-27 0:00:00'] sum_hourly_orgs = num_current_orgs.resample(t_division).sum() sum_hourly_orgs = sum_hourly_orgs.fillna(0) sum_hourly_orgs = sum_hourly_orgs[ '2016-06-18 00:00:00':'2016-07-27 0:00:00'] shu_array = sum_hourly_viewers.values.astype('float') sho_array = sum_hourly_orgs.values.astype('float') account_data.append(shu_array.tolist()) org_data.append(sho_array.tolist()) vids_out.append(vid) print(str(vid_ind) + ' ', end='') account_data = np.array(account_data) org_data = np.array(org_data) vids_out = np.array(vids_out) print('\nsaving data to ' + this_division_file_name + ' ' + this_data_file_name) sum_hourly_viewers.to_hdf('../' + this_division_file_name, 'w') out_file = h5py.File('../' + this_data_file_name, 'w') out_file.create_dataset('account_data', data=account_data) out_file.create_dataset('org_data', data=org_data) out_file.create_dataset('vids_out', data=vids_out) out_file.flush() out_file.close() t = sum_hourly_viewers.index return vids_out, t, account_data, org_data
def test_datetimeindex_constructor_misc(self): arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] pytest.raises(Exception, DatetimeIndex, arr) arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'] idx1 = DatetimeIndex(arr) arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04'] idx2 = DatetimeIndex(arr) arr = [lib.Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', '2005-01-04'] idx3 = DatetimeIndex(arr) arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'], dtype='O') idx4 = DatetimeIndex(arr) arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']) idx5 = DatetimeIndex(arr) arr = to_datetime(['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04' ]) idx6 = DatetimeIndex(arr) idx7 = DatetimeIndex(['12/05/2007', '25/01/2008'], dayfirst=True) idx8 = DatetimeIndex(['2007/05/12', '2008/01/25'], dayfirst=False, yearfirst=True) tm.assert_index_equal(idx7, idx8) for other in [idx2, idx3, idx4, idx5, idx6]: assert (idx1.values == other.values).all() sdate = datetime(1999, 12, 25) edate = datetime(2000, 1, 1) idx = DatetimeIndex(start=sdate, freq='1B', periods=20) assert len(idx) == 20 assert idx[0] == sdate + 0 * offsets.BDay() assert idx.freq == 'B' idx = DatetimeIndex(end=edate, freq=('D', 5), periods=20) assert len(idx) == 20 assert idx[-1] == edate assert idx.freq == '5D' idx1 = DatetimeIndex(start=sdate, end=edate, freq='W-SUN') idx2 = DatetimeIndex(start=sdate, end=edate, freq=offsets.Week(weekday=6)) assert len(idx1) == len(idx2) assert idx1.offset == idx2.offset idx1 = DatetimeIndex(start=sdate, end=edate, freq='QS') idx2 = DatetimeIndex(start=sdate, end=edate, freq=offsets.QuarterBegin(startingMonth=1)) assert len(idx1) == len(idx2) assert idx1.offset == idx2.offset idx1 = DatetimeIndex(start=sdate, end=edate, freq='BQ') idx2 = DatetimeIndex(start=sdate, end=edate, freq=offsets.BQuarterEnd(startingMonth=12)) assert len(idx1) == len(idx2) assert idx1.offset == idx2.offset
price_regressor = False #time_series.columns = ['ds', 'y', 'weekends', 'snap', 'floor'] time_series.columns = ['ds', 'y', 'weekends', 'snap'] #time_max = np.max(time_series['y']) * 1.1 #time_series['cap'] = time_max time_series['cum7'] = cum7[i, (start_date - 1):-28] # time_series['cum14'] = cum14[i, (start_date-1):-28] # time_series['cum28'] = cum28[i, (start_date-1):-28] # time_series['cum56'] = cum56[i, (start_date-1):-28] time_series['cum_max'] = cum_max[i, (start_date - 1):-28] time_series['cum_zero'] = cum_zero[i, (start_date - 1):-28] end_train = len(time_series) - 28 time_series.loc[:, 'ds'] = pd.datetime(2011, 1, 29) + pd.to_timedelta( time_series['ds'] - 1, unit='d') m = Prophet(uncertainty_samples=0, holidays=holidays, changepoint_prior_scale=0.9, holidays_prior_scale=0.05, yearly_seasonality=5) #growth='logistic') # m.add_country_holidays(country_name='US') if price_regressor == True: m.add_regressor('price') m.add_regressor('weekends') m.add_regressor('snap')
def __init__(self, ascategory=True, t0=pd.datetime(2000, 1, 1)): self.ascategory = ascategory self.t0 = t0
# Compute the RFM model values
# groupby() yields a one-dimensional array whose index is the groupby key
# Aggregate order dates by id and take the maximum
recency_data = sales_data['ORDERDATE'].groupby(sales_data.index).max()
# Aggregate orders by id and count them
frequency_values = sales_data['ORDERID'].groupby(sales_data.index).count()
# Order amounts
monetary_value = sales_data['AMOUNTINFO'].groupby(sales_data.index).sum()
print(type(monetary_value))
# Compute the RFM scores
deadline_data = pd.datetime(2017, 1, 1)  # fixed cutoff date; the interval from it is the model's R value
r_interval = (deadline_data - recency_data).dt.days  # for a Series: Series.dt.days, number of days for each element
# cut picks evenly spaced bins over the value range; qcut picks bins with equal frequency.
# So with cut some labels may never appear: e.g. values of only 1 and 10, split into
# 5 bins with labels=[1,2,3,4,5], will only ever produce labels 1 and 5.
r_score = pd.cut(x=r_interval, bins=5, labels=[5, 4, 3, 2, 1])  # a smaller interval is better; labels are ordered small to large, so the largest value gets the last label
f_score = pd.cut(x=frequency_values, bins=5, labels=[1, 2, 3, 4, 5])
m_score = pd.cut(x=monetary_value, bins=5, labels=[1, 2, 3, 4, 5])
# Merge the RFM values into a dataframe
rfm_list = [r_score, f_score, m_score]  # assemble into a list
rfm_col_names = ['r_score', 'f_score', 'm_score']
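# A minimal sketch of the cut/qcut distinction described in the comments above,
# on toy data (not the real sales_data):
import pandas as pd

values = pd.Series([1, 1, 2, 2, 10])
# cut: bin edges evenly spaced over [1, 10], so middle labels may never appear
print(pd.cut(values, bins=5, labels=[1, 2, 3, 4, 5]).tolist())  # [1, 1, 1, 1, 5]
# qcut: bin edges chosen so each bin holds (roughly) the same number of values
print(pd.qcut(values, q=2, labels=[1, 2]).tolist())             # [1, 1, 1, 1, 2]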
parse_dates=["date"], skiprows=range(1, 66458909) # 2016-01-01 ) df_test = pd.read_csv( "../Data/test.csv", usecols=[0, 1, 2, 3, 4], dtype={ 'onpromotion': bool }, parse_dates=["date"] # , date_parser=parser ).set_index(['store_nbr', 'item_nbr', 'date']) items = pd.read_csv("../Data/items.csv", ).set_index("item_nbr") df_2017 = df_train.loc[df_train.date >= pd.datetime(2017, 1, 1)] del df_train promo_2017_train = df_2017.set_index(["store_nbr", "item_nbr", "date" ])[["onpromotion" ]].unstack(level=-1).fillna(False) promo_2017_train.columns = promo_2017_train.columns.get_level_values(1) promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False) promo_2017_test.columns = promo_2017_test.columns.get_level_values(1) promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False) promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1) del promo_2017_test, promo_2017_train df_2017 = df_2017.set_index(["store_nbr", "item_nbr", "date" ])[["unit_sales"]].unstack(level=-1).fillna(0) df_2017.columns = df_2017.columns.get_level_values(1)
def download_volatility(config): """Downloads volatility data from OMI website.""" url = 'https://realized.oxford-man.ox.ac.uk/images/oxfordmanrealizedvolatilityindices.zip' data_folder = config.data_folder csv_path = os.path.join(data_folder, 'oxfordmanrealizedvolatilityindices.csv') zip_path = os.path.join(data_folder, 'oxfordmanrealizedvolatilityindices.zip') download_and_unzip(url, zip_path, csv_path, data_folder) print('Unzip complete. Adding extra inputs') df = pd.read_csv(csv_path, index_col=0) # no explicit index # Adds additional date/day fields idx = [str(s).split('+')[0] for s in df.index] # ignore timezones, we don't need them dates = pd.to_datetime(idx) df['date'] = dates df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days df['day_of_week'] = dates.dayofweek df['day_of_month'] = dates.day df['week_of_year'] = dates.weekofyear df['month'] = dates.month df['year'] = dates.year df['categorical_id'] = df['Symbol'].copy() # Processes log volatility vol = df['rv5_ss'].copy() vol.loc[vol == 0.] = np.nan df['log_vol'] = np.log(vol) # Adds static information symbol_region_mapping = { '.AEX': 'EMEA', '.AORD': 'APAC', '.BFX': 'EMEA', '.BSESN': 'APAC', '.BVLG': 'EMEA', '.BVSP': 'AMER', '.DJI': 'AMER', '.FCHI': 'EMEA', '.FTMIB': 'EMEA', '.FTSE': 'EMEA', '.GDAXI': 'EMEA', '.GSPTSE': 'AMER', '.HSI': 'APAC', '.IBEX': 'EMEA', '.IXIC': 'AMER', '.KS11': 'APAC', '.KSE': 'APAC', '.MXX': 'AMER', '.N225': 'APAC ', '.NSEI': 'APAC', '.OMXC20': 'EMEA', '.OMXHPI': 'EMEA', '.OMXSPI': 'EMEA', '.OSEAX': 'EMEA', '.RUT': 'EMEA', '.SMSI': 'EMEA', '.SPX': 'AMER', '.SSEC': 'APAC', '.SSMI': 'EMEA', '.STI': 'APAC', '.STOXX50E': 'EMEA' } df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k]) # Performs final processing output_df_list = [] for grp in df.groupby('Symbol'): sliced = grp[1].copy() sliced.sort_values('days_from_start', inplace=True) # Impute log volatility values sliced['log_vol'].fillna(method='ffill', inplace=True) sliced.dropna() output_df_list.append(sliced) df = pd.concat(output_df_list, axis=0) output_file = config.data_csv_path print('Completed formatting, saving to {}'.format(output_file)) df.to_csv(output_file) print('Done.')
#TODO: Using the same data as before, instead of a two dimensional histogram, break it up as above #text and annotation plt.style.use('seaborn-whitegrid') births = pd.read_csv(path + 'births.csv') quartiles = np.percentile(births['births'], [25, 50, 75]) mu, sig = quartiles[1], 0.74 * (quartiles[2] - quartiles[0]) births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)') births['day'] = births['day'].astype(int) births.index = pd.to_datetime(10000 * births.year + 100 * births.month + births.day, format='%Y%m%d') births_by_date = births.pivot_table('births', [births.index.month, births.index.day]) births_by_date.index = [pd.datetime(2012, month, day) for (month, day) in births_by_date.index] fig, ax = plt.subplots(figsize=(12, 4)) births_by_date.plot(ax=ax); fig, ax = plt.subplots(figsize=(12, 4)) births_by_date.plot(ax=ax) # Add labels to the plot style = dict(size=10, color='gray') ax.text('2012-1-1', 3950, "New Year's Day", **style) ax.text('2012-7-4', 4250, "Independence Day", ha='center', **style) ax.text('2012-9-4', 4850, "Labor Day", ha='center', **style) ax.text('2012-10-31', 4600, "Halloween", ha='right', **style) ax.text('2012-11-25', 4450, "Thanksgiving", ha='center', **style) ax.text('2012-12-25', 3850, "Christmas ", ha='right', **style) # Label the axes ax.set(title='USA births by day of year (1969-1988)',
ax.set_title('Show only input flows') plt.show() # ***** 4. example *************************************************** # Create a plot to show the balance around a bus. # Order and colors are customisable. inorder = [(('pv', 'electricity'), 'flow'), (('wind', 'electricity'), 'flow'), (('storage', 'electricity'), 'flow'), (('pp_gas', 'electricity'), 'flow')] fig = plt.figure(figsize=(10, 5)) electricity_seq = views.node(results, 'electricity')['sequences'] plot_slice = oev.plot.slice_df(electricity_seq, date_from=pd.datetime(2012, 2, 15)) my_plot = oev.plot.io_plot('electricity', plot_slice, cdict=cdict, inorder=inorder, ax=fig.add_subplot(1, 1, 1), smooth=False) ax = shape_legend('electricity', **my_plot) oev.plot.set_datetime_ticks(ax, plot_slice.index, tick_distance=48, date_format='%d-%m-%H', offset=12) ax.set_ylabel('Power in MW') ax.set_xlabel('2012') ax.set_title("Electricity bus, non-smoothed representation") # ***** 5. example *************************************************** # Create a plot to show the balance around a bus. # Make a smooth plot even though it is not scientifically correct.
def hours_of_daylight(date, axis=23.44, latitude=47.61): """Compute the hours of daylight for the given date""" days = (date - pd.datetime(2000, 12, 21)).days m = (1. - np.tan(np.radians(latitude)) * np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25))) return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180.
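# A quick usage check of the formula above (the default latitude of 47.61 is
# Seattle); the commented values are approximate:
import pandas as pd

print(hours_of_daylight(pd.Timestamp(2015, 6, 21)))   # ~15.8 hours near the summer solstice
print(hours_of_daylight(pd.Timestamp(2015, 12, 21)))  # ~8.2 hours near the winter solstice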
# determine unique values in a column
users.occupation.nunique()   # count the number of unique values
users.occupation.unique()    # return the unique values

# replace all instances of a value in a column (must match entire value)
ufo.State.replace('Fl', 'FL', inplace=True)

# string methods are accessed via 'str'
ufo.State.str.upper()                              # converts to uppercase
ufo.Colors_Reported.str.contains('RED', na=False)  # checks for a substring; na takes the boolean False, not the string 'False'

# convert a string to the datetime format
ufo['Time'] = pd.to_datetime(ufo.Time)
ufo.Time.dt.hour                         # datetime format exposes convenient attributes
(ufo.Time.max() - ufo.Time.min()).days   # also allows you to do datetime "math"
ufo[ufo.Time > pd.datetime(2014, 1, 1)]  # boolean filtering with datetime format

# setting and then removing an index
ufo.set_index('Time', inplace=True)
ufo.reset_index(inplace=True)

# sort a column by its index
ufo.State.value_counts().sort_index()

# change the data type of a column
drinks['beer'] = drinks.beer.astype('float')

# change the data type of a column when reading in a file
pd.read_csv('drinks.csv', dtype={'beer_servings': float})

# create dummy variables for 'continent' and exclude first dummy column
from hydroDL.master import basins from hydroDL.app import waterQuality from hydroDL import kPath, utils from hydroDL.model import trainTS from hydroDL.post import axplot, figplot from hydroDL.data import usgs, gageII, gridMET, ntn, transform import torch import os import json import numpy as np import pandas as pd import time import matplotlib.pyplot as plt import statsmodels.api as sm startDate = pd.datetime(1979, 1, 1) endDate = pd.datetime(2020, 1, 1) sn = 1 codeLst = usgs.newC dirSel = os.path.join(kPath.dirData, 'USGS', 'inventory', 'siteSel') with open(os.path.join(dirSel, 'dictRB_Y30N5.json')) as f: dictSite = json.load(f) siteNoLst = dictSite['comb'] t0 = time.time() dirRoot = os.path.join(kPath.dirWQ, 'modelStat', 'WRTDS-W') dirOut = os.path.join(dirRoot, 'B10') for folder in [dirRoot, dirOut]: if not os.path.exists(folder): os.mkdir(folder)
'Long Weighted Return Tc', 'Short Weighted Return Tc']]  # Remove useless columns

dfRaw2 = pd.read_csv('dbo_famafrench.txt', delimiter='\t')
dfRaw3 = dfRaw2.loc[:, ['d', 'mktrf', 'smb', 'hml', 'rf', 'umd']]  # De-normalized table

# dfRaw1['Row Labels'].dt.date
dfRaw1['Row Labels'] = pd.to_datetime(dfRaw1['Row Labels'])
dfRaw1.set_index('Row Labels', inplace=True)
dfRaw3['d'] = dfRaw3['d'].str.split(expand=True)
dfRaw3['d'] = pd.to_datetime(dfRaw3['d'])
dfRaw3.set_index('d', inplace=True)
result = dfRaw1.join(dfRaw3, how='inner')

# leading-zero integer literals like 01 are a SyntaxError in Python 3
start = pd.datetime(2001, 1, 1)
end = pd.datetime(2004, 12, 31)
q2Data = result[(result.index >= start) & (result.index <= end)]


## Q2 (a)
def ComputeAnnualStat(portfolioType, startYear, yearLength):
    annualReturn = []
    for i in range(yearLength):
        start = pd.datetime(startYear + i, 1, 1)
        end = pd.datetime(startYear + i, 12, 31)
        annualData = result[(result.index >= start) & (result.index <= end)]
        annualReturn.append(annualData[portfolioType].sum())
    meanAnlReturn = np.average(annualReturn)
    anlVolatility = np.std(annualReturn)
    anlSharpe = meanAnlReturn / anlVolatility
import pandas as pd import numpy as np import scipy as sp print(pd.datetime.now()) # current date and time print(pd.Timestamp('2017-03-01')) # timestamp print(pd.Timestamp(1587687255, unit='s')) print(pd.date_range("11:00", "13:30", freq="30min").time) print(pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None]))) print(pd.to_datetime(['2005/11/23', '2010.12.31', None])) print(pd.date_range('1/1/2011', periods=5)) print(pd.date_range('1/1/2011', periods=5, freq='M')) start = pd.datetime(2011, 1, 1) end = pd.datetime(2011, 1, 5) print(pd.date_range(start, end)) print(pd.Timedelta('2 days 2 hours 15 minutes 30 seconds')) print(pd.Timedelta(6, unit='h')) print(pd.Timedelta(days=2)) s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) td = pd.Series([pd.Timedelta(days=i) for i in range(3)]) df = pd.DataFrame(dict(A=s, B=td)) print(df) df['C'] = df['A'] + df['B'] print(df)
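# Worth noting for newer environments: pd.datetime was deprecated in pandas 1.0
# and removed in pandas 2.0. A sketch of drop-in replacements for the
# constructors used above:
from datetime import datetime
import pandas as pd

start = datetime(2011, 1, 1)         # instead of pd.datetime(2011, 1, 1)
start_ts = pd.Timestamp(2011, 1, 1)  # equivalent pandas-native form
now = pd.Timestamp.now()             # instead of pd.datetime.now()
print(pd.date_range(start, periods=5))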
def main(fobs_Esoil, fobs_vwc, fcable, case_name, ring, layer, ep_type): est_esoil = pd.read_csv(fobs_Esoil, usecols=['Ring', 'Date', 'wuTP', 'EfloorPred']) est_esoil['Date'] = pd.to_datetime(est_esoil['Date'], format="%d/%m/%Y", infer_datetime_format=False) est_esoil['Date'] = est_esoil['Date'] - pd.datetime(2011, 12, 31) est_esoil['Date'] = est_esoil['Date'].dt.days est_esoil = est_esoil.sort_values(by=['Date']) # divide neo into groups if ring == 'amb': subset = est_esoil[(est_esoil['Ring'].isin(['R2', 'R3', 'R6'])) & (est_esoil.Date > 366)] elif ring == 'ele': subset = est_esoil[(est_esoil['Ring'].isin(['R1', 'R4', 'R5'])) & (est_esoil.Date > 366)] else: subset = est_esoil[(est_esoil['Ring'].isin([ring])) & (est_esoil.Date > 366)] subset = subset.groupby(by=["Date"]).mean() #subset['wuTP'] = subset['wuTP'].replace(NA, float('nan')) subset['wuTP'] = subset['wuTP'].clip(lower=0.) subset['wuTP'] = subset['wuTP'].replace(0., float('nan')) #subset['EfloorPred'] = subset['EfloorPred'].replace(NA, float('nan')) subset['EfloorPred'] = subset['EfloorPred'].clip(lower=0.) subset['EfloorPred'] = subset['EfloorPred'].replace(0., float('nan')) #subset = subset.xs('swc.tdr', axis=1, drop_level=True) #print(subset) # tdr at 30cm depth tdr_30 = pd.read_csv(fobs_vwc, usecols=['Ring', 'Date', 'vwcMean']) tdr_30['Date'] = pd.to_datetime(tdr_30['Date'], format="%d/%m/%Y", infer_datetime_format=False) tdr_30['Date'] = tdr_30['Date'] - pd.datetime(2011, 12, 31) tdr_30['Date'] = tdr_30['Date'].dt.days tdr_30 = tdr_30.sort_values(by=['Date']) # divide neo into groups if ring == 'amb': subset1 = tdr_30[(tdr_30['Ring'].isin(['R2', 'R3', 'R6'])) & (tdr_30.Date > 366)] elif ring == 'ele': subset1 = tdr_30[(tdr_30['Ring'].isin(['R1', 'R4', 'R5'])) & (tdr_30.Date > 366)] else: subset1 = tdr_30[(tdr_30['Ring'].isin([ring])) & (tdr_30.Date > 366)] subset1 = subset1.groupby(by=["Date"]).mean() subset1['vwcMean'] = subset1['vwcMean'].clip(lower=0.) 
subset1['vwcMean'] = subset1['vwcMean'].replace(0., float('nan')) #subset1['vwcMean'] = subset1['wuTP'].replace('NA', float('nan')) #print(subset1) # _________________________ CABLE ___________________________ cable = nc.Dataset(fcable, 'r') Time = nc.num2date(cable.variables['time'][:], cable.variables['time'].units) SoilMoist = pd.DataFrame(cable.variables['SoilMoist'][:, 0, 0, 0], columns=['SoilMoist']) if layer == "6": SoilMoist['SoilMoist'] = ( cable.variables['SoilMoist'][:,0,0,0]*0.022 \ + cable.variables['SoilMoist'][:,1,0,0]*0.058 \ + cable.variables['SoilMoist'][:,2,0,0]*0.154 \ + cable.variables['SoilMoist'][:,3,0,0]*(0.5-0.022-0.058-0.154) )/0.5 elif layer == "13": SoilMoist['SoilMoist'] = ( cable.variables['SoilMoist'][:,0,0,0]*0.02 \ + cable.variables['SoilMoist'][:,1,0,0]*0.05 \ + cable.variables['SoilMoist'][:,2,0,0]*0.06 \ + cable.variables['SoilMoist'][:,3,0,0]*0.13 \ + cable.variables['SoilMoist'][:,3,0,0]*(0.5-0.02-0.05-0.06-0.13) )/0.5 elif layer == "31uni": SoilMoist['SoilMoist'] = ( cable.variables['SoilMoist'][:,0,0,0]*0.15 \ + cable.variables['SoilMoist'][:,1,0,0]*0.15 \ + cable.variables['SoilMoist'][:,2,0,0]*0.15 \ + cable.variables['SoilMoist'][:,3,0,0]*0.05 )/0.5 elif layer == "31exp": SoilMoist['SoilMoist'] = ( cable.variables['SoilMoist'][:,0,0,0]*0.020440 \ + cable.variables['SoilMoist'][:,1,0,0]*0.001759 \ + cable.variables['SoilMoist'][:,2,0,0]*0.003957 \ + cable.variables['SoilMoist'][:,3,0,0]*0.007035 \ + cable.variables['SoilMoist'][:,4,0,0]*0.010993 \ + cable.variables['SoilMoist'][:,5,0,0]*0.015829 \ + cable.variables['SoilMoist'][:,6,0,0]*0.021546 \ + cable.variables['SoilMoist'][:,7,0,0]*0.028141 \ + cable.variables['SoilMoist'][:,8,0,0]*0.035616 \ + cable.variables['SoilMoist'][:,9,0,0]*0.043971 \ + cable.variables['SoilMoist'][:,10,0,0]*0.053205 \ + cable.variables['SoilMoist'][:,11,0,0]*0.063318 \ + cable.variables['SoilMoist'][:,12,0,0]*0.074311 \ + cable.variables['SoilMoist'][:,13,0,0]*0.086183 \ + cable.variables['SoilMoist'][:,14,0,0]*(0.5-0.466304))/0.5 elif layer == "31para": SoilMoist['SoilMoist'] = ( cable.variables['SoilMoist'][:,0,0,0]*0.020440 \ + cable.variables['SoilMoist'][:,1,0,0]*0.001759 \ + cable.variables['SoilMoist'][:,2,0,0]*0.003957 \ + cable.variables['SoilMoist'][:,3,0,0]*0.007035 \ + cable.variables['SoilMoist'][:,4,0,0]*0.010993 \ + cable.variables['SoilMoist'][:,5,0,0]*0.015829 \ + cable.variables['SoilMoist'][:,6,0,0]*(0.5-0.420714))/0.5 SoilMoist['dates'] = Time SoilMoist = SoilMoist.set_index('dates') SoilMoist = SoilMoist.resample("D").agg('mean') SoilMoist.index = SoilMoist.index - pd.datetime(2011, 12, 31) SoilMoist.index = SoilMoist.index.days SoilMoist = SoilMoist.sort_values(by=['dates']) ESoil = pd.DataFrame(cable.variables['ESoil'][:, 0, 0], columns=['ESoil']) ESoil = ESoil * 1800. ESoil['dates'] = Time ESoil = ESoil.set_index('dates') ESoil = ESoil.resample("D").agg('sum') ESoil.index = ESoil.index - pd.datetime(2011, 12, 31) ESoil.index = ESoil.index.days #print(ESoil) if ep_type == 'PotEvap': Ep = pd.DataFrame(cable.variables['PotEvap'][:, 0, 0], columns=['Ep']) Ep = Ep * 1800. 
Ep['dates'] = Time Ep = Ep.set_index('dates') Ep = Ep.resample("D").agg('sum') Ep.index = Ep.index - pd.datetime(2011, 12, 31) Ep.index = Ep.index.days #print(Ep) elif ep_type == 'Rnet-G': Ep = pd.DataFrame(cable.variables['Rnet'][:, 0, 0] - cable.variables['Qg'][:, 0, 0], columns=['Ep']) Ep['dates'] = Time Ep = Ep.set_index('dates') Ep = Ep.resample("D").agg('mean') Ep.index = Ep.index - pd.datetime(2011, 12, 31) Ep.index = Ep.index.days print(Ep * 86400 / 2454000) Rainf = pd.DataFrame(cable.variables['Rainf'][:, 0, 0], columns=['Rainf']) Rainf = Rainf * 1800. Rainf['dates'] = Time Rainf = Rainf.set_index('dates') Rainf = Rainf.resample("D").agg('sum') Rainf.index = Rainf.index - pd.datetime(2011, 12, 31) Rainf.index = Rainf.index.days rain = Rainf['Rainf'].loc[np.all( [Rainf.index.isin(subset.index), Rainf.index.isin(subset1.index)], axis=0)] esoil = ESoil['ESoil'].loc[np.all( [ESoil.index.isin(subset.index), ESoil.index.isin(subset1.index)], axis=0)] ep = Ep['Ep'].loc[np.all( [Ep.index.isin(subset.index), Ep.index.isin(subset1.index)], axis=0)] soilmoist = SoilMoist['SoilMoist'].loc[np.all([ SoilMoist.index.isin(subset.index), SoilMoist.index.isin(subset1.index) ], axis=0)] wuTP = subset['wuTP'].loc[np.all( [subset.index.isin(subset1.index), subset.index.isin(SoilMoist.index)], axis=0)] EfloorPred = subset['EfloorPred'].loc[np.all( [subset.index.isin(subset1.index), subset.index.isin(SoilMoist.index)], axis=0)] vwcMean = subset1['vwcMean'].loc[subset1.index.isin(subset.index)] #.loc[np.all([subset1.index.isin(subset.index), subset.index.isin(SoilMoist.index)],axis=0)] # exclude tdr soilmoisture < 0 or tdr esoil < 0 mask = np.any([np.isnan(wuTP), np.isnan(vwcMean)], axis=0) print(mask) rain = rain[mask == False] esoil = esoil[mask == False] ep = ep[mask == False] soilmoist = soilmoist[mask == False] wuTP = wuTP[mask == False] EfloorPred = EfloorPred[mask == False] vwcMean = vwcMean[mask == False] print("any(rain>0.)") print(np.any(rain > 0.)) # exclude rainday and the after two days of rain mask = np.ones((len(rain)), dtype=bool) #print(rain) if rain.values[0] > 0.: mask[0] = False if rain.values[0] > 0. or rain.values[1] > 0.: mask[1] = False for i in np.arange(2, len(rain)): if rain.values[i] > 0. or rain.values[i - 1] > 0. or rain.values[i - 2] > 0.: mask[i] = False rain = rain[mask == True] esoil = esoil[mask == True] ep = ep[mask == True] soilmoist = soilmoist[mask == True] wuTP = wuTP[mask == True] EfloorPred = EfloorPred[mask == True] vwcMean = vwcMean[mask == True] print("any(rain>0.)") print(np.any(rain > 0.)) # exclude the days Rnet < 0. ep = ep.clip(lower=0.) 
ep = ep.replace(0., float('nan'))
mask = np.isnan(ep)
esoil = esoil[~mask]
ep = ep[~mask]
soilmoist = soilmoist[~mask]
wuTP = wuTP[~mask]
EfloorPred = EfloorPred[~mask]
vwcMean = vwcMean[~mask]

if ep_type == 'PotEvap':
    rate = esoil / ep
    rate_tdr = wuTP / ep
elif ep_type == 'Rnet-G':
    # convert available energy (W m-2) to mm day-1 before taking the ratio
    rate = esoil / (ep * 86400 / 2454000)
    rate_tdr = wuTP / (ep * 86400 / 2454000)

print("-------------------------------------------------")
print(np.any(esoil < 0.))
print(np.any(ep < 0.))
print(np.any(soilmoist < 0.))
print(np.any(wuTP < 0.))
print(np.any(vwcMean < 0.))
print(esoil)
print(ep)
print(soilmoist)
print(wuTP)
print(vwcMean)
print(rate)
print(rate_tdr)
print("-------------------------------------------------")

# ____________________ Plot obs _______________________
fig = plt.figure(figsize=[15, 10])
fig.subplots_adjust(hspace=0.1)
fig.subplots_adjust(wspace=0.05)
plt.rcParams['text.usetex'] = False
plt.rcParams['font.family'] = "sans-serif"
plt.rcParams['font.sans-serif'] = "Helvetica"
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['font.size'] = 14
plt.rcParams['legend.fontsize'] = 10
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14

almost_black = '#262626'
# change the tick colors also to the almost black
plt.rcParams['ytick.color'] = almost_black
plt.rcParams['xtick.color'] = almost_black
# change the text colors also to the almost black
plt.rcParams['text.color'] = almost_black
# Change the default axis colors from black to a slightly lighter black,
# and a little thinner (0.5 instead of 1)
plt.rcParams['axes.edgecolor'] = almost_black
plt.rcParams['axes.labelcolor'] = almost_black

ax1 = fig.add_subplot(111)
ax1.scatter(soilmoist, rate, s=2, marker='o', c='orange')
ax1.scatter(vwcMean, rate_tdr, s=2, marker='o', c='green')
ax1.set_xlim(0., 0.4)
ax1.set_ylim(0., 1.)
fig.savefig("EucFACE_Esoil_E0_theta_Gimeno-tdr_%s_%s_%s.png"
            % (ep_type, case_name, ring),
            bbox_inches='tight', pad_inches=0.1)
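# The weighted sums in the layer branches above all repeat one pattern: a
# thickness-weighted mean of the model's soil-moisture layers over the top
# 0.5 m. A minimal sketch (not part of the original script; the helper name
# is hypothetical) that factors the pattern out:
import numpy as np

def top_layer_soil_moisture(sm_layers, thicknesses, depth=0.5):
    """Thickness-weighted mean of sm_layers (time x nlayers) over `depth` m."""
    cum = np.cumsum(np.asarray(thicknesses, dtype=float))
    prev = np.concatenate(([0.], cum[:-1]))
    # portion of each layer that lies within the top `depth` metres
    weights = np.clip(np.minimum(cum, depth) - prev, 0., None)
    return np.asarray(sm_layers) @ weights / depth

# e.g. the "6"-layer branch above is equivalent to:
# SoilMoist['SoilMoist'] = top_layer_soil_moisture(
#     cable.variables['SoilMoist'][:, :4, 0, 0], [0.022, 0.058, 0.154, 0.266])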
def remove_outlier(self):
    idx = pd.datetime(2017, 4, 15, 23)
    self.data.drop(index=idx, inplace=True)
    self.feat.drop(index=idx, inplace=True)
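# Aside (not from the original source): pd.datetime is simply an alias for
# the standard library's datetime.datetime, and the alias was deprecated and
# later removed from pandas. Equivalent spellings of the construction above
# that keep working on current pandas:
from datetime import datetime
import pandas as pd

idx = datetime(2017, 4, 15, 23)      # stdlib drop-in replacement
idx = pd.Timestamp(2017, 4, 15, 23)  # pandas-native; compares equal to the above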
def main(args=None):
    r"""MarketFlow Main Program

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the market configuration.
    (4) Get the model configuration.
    (5) Create the model object.
    (6) Call the main MarketFlow pipeline.

    Raises
    ------
    ValueError
        Training date must be before prediction date.

    """

    # Logging
    logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s",
                        filename="market_flow.log", filemode='a',
                        level=logging.DEBUG,
                        datefmt='%m/%d/%y %H:%M:%S')
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s",
                                  datefmt='%m/%d/%y %H:%M:%S')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    # Start the pipeline
    logger.info('*' * 80)
    logger.info("MarketFlow Start")
    logger.info('*' * 80)

    # Argument Parsing
    parser = argparse.ArgumentParser(description="MarketFlow Parser")
    parser.add_argument('--pdate', dest='predict_date',
                        help="prediction date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    parser.add_argument('--tdate', dest='train_date',
                        help="training date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    # --predict and --train are mutually exclusive, so they must be added to
    # the group object rather than directly to the parser
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--predict', dest='predict_mode', action='store_true')
    group.add_argument('--train', dest='predict_mode', action='store_false')
    parser.set_defaults(predict_mode=False)
    args = parser.parse_args()

    # Set train and predict dates
    if args.train_date:
        train_date = args.train_date
    else:
        train_date = pd.datetime(1900, 1, 1).strftime("%Y-%m-%d")
    if args.predict_date:
        predict_date = args.predict_date
    else:
        predict_date = datetime.date.today().strftime("%Y-%m-%d")

    # Verify that the dates are in sequence.
    if train_date >= predict_date:
        raise ValueError("Training date must be before prediction date")
    else:
        logger.info("Training Date: %s", train_date)
        logger.info("Prediction Date: %s", predict_date)

    # Read stock configuration file
    market_specs = get_market_config()

    # Read model configuration file
    model_specs = get_model_config()
    model_specs['predict_mode'] = args.predict_mode
    model_specs['predict_date'] = predict_date
    model_specs['train_date'] = train_date

    # Create directories if necessary
    output_dirs = ['config', 'data', 'input', 'model',
                   'output', 'plots', 'systems']
    for od in output_dirs:
        output_dir = SSEP.join([model_specs['directory'], od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create a model from the arguments
    logger.info("Creating Model")
    model = Model(model_specs)

    # Start the pipeline
    model = market_pipeline(model, market_specs)

    # Complete the pipeline
    logger.info('*' * 80)
    logger.info("MarketFlow End")
    logger.info('*' * 80)
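# `valid_date` is used as an argparse type above but is not shown in this
# excerpt. A minimal sketch of what such a validator could look like (an
# assumption, not the project's actual implementation); returning the
# normalized YYYY-MM-DD string keeps the lexicographic date comparison in
# main() valid:
import argparse
from datetime import datetime

def valid_date(s):
    try:
        return datetime.strptime(s, "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        raise argparse.ArgumentTypeError("not a valid date: {!r}".format(s))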
    print('idx_lon: ', idx_lon, 'NetCDF lon: ', val_lon, 'given lon: ', target_lon)
    print('norm: ', least_norm)
    print('\n')
    return idx_lat, idx_lon, val_lat, val_lon


if __name__ == "__main__":
    # Target lat/lon (CHANGE THIS)
    target_lat = -41.26101779
    target_lon = 148.166736

    # Generate filenames and read them to build dataframes (CHANGE THIS)
    path_to_file = '//home//thorweather//gfs_files//nc//'
    start_date = pd.datetime(2015, 1, 15)  # YYYY, month, day
    end_date = pd.datetime(2017, 3, 25)  # YYYY, month, day
    utc_datetime_range = pd.date_range(start=start_date, end=end_date, freq='6H')
    start_of_loop = 1
    for date_time in utc_datetime_range:
        # Generate the filename (we could also read it from the NetCDF directly)
        str_year = str(date_time.year)
        str_month = str('%02d' % (date_time.month))
        str_day = str('%02d' % (date_time.day))
        str_fcst_hour = str('%02d' % (date_time.hour))
        netcdf_filename = ('gfs.0p25.' + str_year + str_month + str_day
                           + str_fcst_hour + '.f000.grib2.abrie233580.nc')
        file_path = path_to_file + netcdf_filename
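# The search that yields idx_lat/idx_lon above lies outside this excerpt; a
# minimal sketch of a nearest-grid-point lookup over the regular 0.25-degree
# GFS grid (an assumption about the original routine; names are hypothetical):
import numpy as np

def nearest_grid_point(lats, lons, target_lat, target_lon):
    """Return indices and values of the grid point closest to the target."""
    idx_lat = int(np.argmin(np.abs(np.asarray(lats) - target_lat)))
    idx_lon = int(np.argmin(np.abs(np.asarray(lons) - target_lon)))
    return idx_lat, idx_lon, lats[idx_lat], lons[idx_lon]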
def test_comparisons_nat(self):
    fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0])
    fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0])

    didx1 = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT,
                              '2014-05-01', '2014-07-01'])
    didx2 = pd.DatetimeIndex(['2014-02-01', '2014-03-01', pd.NaT, pd.NaT,
                              '2014-06-01', '2014-07-01'])
    darr = np.array([np_datetime64_compat('2014-02-01 00:00Z'),
                     np_datetime64_compat('2014-03-01 00:00Z'),
                     np_datetime64_compat('nat'), np.datetime64('nat'),
                     np_datetime64_compat('2014-06-01 00:00Z'),
                     np_datetime64_compat('2014-07-01 00:00Z')])

    if _np_version_under1p8:
        # cannot test the array case because np.datetime64('nat') returns
        # today's date on these versions
        cases = [(fidx1, fidx2), (didx1, didx2)]
    else:
        cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)]

    # Check pd.NaT is handled the same as np.nan
    with tm.assert_produces_warning(None):
        for idx1, idx2 in cases:
            result = idx1 < idx2
            expected = np.array([True, False, False, False, True, False])
            tm.assert_numpy_array_equal(result, expected)

            result = idx2 > idx1
            expected = np.array([True, False, False, False, True, False])
            tm.assert_numpy_array_equal(result, expected)

            result = idx1 <= idx2
            expected = np.array([True, False, False, False, True, True])
            tm.assert_numpy_array_equal(result, expected)

            result = idx2 >= idx1
            expected = np.array([True, False, False, False, True, True])
            tm.assert_numpy_array_equal(result, expected)

            result = idx1 == idx2
            expected = np.array([False, False, False, False, False, True])
            tm.assert_numpy_array_equal(result, expected)

            result = idx1 != idx2
            expected = np.array([True, True, True, True, True, False])
            tm.assert_numpy_array_equal(result, expected)

    with tm.assert_produces_warning(None):
        for idx1, val in [(fidx1, np.nan), (didx1, pd.NaT)]:
            result = idx1 < val
            expected = np.array([False, False, False, False, False, False])
            tm.assert_numpy_array_equal(result, expected)
            result = idx1 > val
            tm.assert_numpy_array_equal(result, expected)
            result = idx1 <= val
            tm.assert_numpy_array_equal(result, expected)
            result = idx1 >= val
            tm.assert_numpy_array_equal(result, expected)

            result = idx1 == val
            tm.assert_numpy_array_equal(result, expected)

            result = idx1 != val
            expected = np.array([True, True, True, True, True, True])
            tm.assert_numpy_array_equal(result, expected)

    # Check pd.NaT is handled the same as np.nan
    with tm.assert_produces_warning(None):
        for idx1, val in [(fidx1, 3), (didx1, datetime(2014, 3, 1))]:
            result = idx1 < val
            expected = np.array([True, False, False, False, False, False])
            tm.assert_numpy_array_equal(result, expected)
            result = idx1 > val
            expected = np.array([False, False, False, False, True, True])
            tm.assert_numpy_array_equal(result, expected)

            result = idx1 <= val
            expected = np.array([True, False, True, False, False, False])
            tm.assert_numpy_array_equal(result, expected)
            result = idx1 >= val
            expected = np.array([False, False, True, False, True, True])
            tm.assert_numpy_array_equal(result, expected)

            result = idx1 == val
            expected = np.array([False, False, True, False, False, False])
            tm.assert_numpy_array_equal(result, expected)

            result = idx1 != val
            expected = np.array([True, True, False, True, True, True])
            tm.assert_numpy_array_equal(result, expected)
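# The invariant the test exercises, standalone: like np.nan, pd.NaT returns
# False from every ordering/equality comparison and True only from `!=`.
import pandas as pd

assert not (pd.NaT == pd.NaT)
assert pd.NaT != pd.NaT
assert not (pd.NaT < pd.Timestamp('2014-01-01'))
assert not (pd.NaT >= pd.Timestamp('2014-01-01'))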
import pysat
import pandas as pds
import numpy as np
import numpy.ma as ma
import matplotlib.pyplot as plt

# dates for demo
ssnDays = 67
startDate = pds.datetime(2009, 12, 21) - pds.DateOffset(days=ssnDays)
stopDate = pds.datetime(2009, 12, 21) + pds.DateOffset(days=ssnDays)


# define functions to customize data for application
def geo2mag(incoord):
    """geographic coordinate to magnetic coordinate (coarse):

    Parameters
    ----------
    incoord : numpy.array of shape (2,*)
        array([[glat0,glat1,glat2,...],[glon0,glon1,glon2,...]]), where
        glat, glon are geographic latitude and longitude
        (or if you have only one point it is [[glat,glon]]).

    Warnings
    --------
    Calculation of geomagnetic coordinates is approximate.
    Coordinates are for a geomagnetic dipole, not the full field.
    Location of geomagnetic dipole set for 2010.

    Returns
    -------
    out : numpy.array of shape (2,*)
        the corresponding geomagnetic latitudes and longitudes
    """
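# A minimal sketch of the coarse dipole conversion the docstring describes
# (an assumption -- the original function body is truncated above): rotate
# the geographic frame so the 2010-epoch dipole pole (roughly 80.0 N,
# -72.2 E) becomes the z-axis, then read lat/lon off the rotated vector.
def geo2mag_sketch(incoord, pole_lat=80.0, pole_lon=-72.2):
    glat, glon = np.radians(np.asarray(incoord, dtype=float))
    # geographic unit vector(s)
    x = np.cos(glat) * np.cos(glon)
    y = np.cos(glat) * np.sin(glon)
    z = np.sin(glat)
    plat, plon = np.radians([pole_lat, pole_lon])
    # rotate about z by -plon, then about y so the dipole pole maps to +z
    x1 = x * np.cos(plon) + y * np.sin(plon)
    y1 = -x * np.sin(plon) + y * np.cos(plon)
    x2 = x1 * np.sin(plat) - z * np.cos(plat)
    z2 = x1 * np.cos(plat) + z * np.sin(plat)
    return np.degrees([np.arcsin(z2), np.arctan2(y1, x2)])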
def test_datetimeindex_accessors(self):
    dti_naive = DatetimeIndex(freq='D', start=datetime(1998, 1, 1),
                              periods=365)
    # GH 13303
    dti_tz = DatetimeIndex(freq='D', start=datetime(1998, 1, 1),
                           periods=365, tz='US/Eastern')
    for dti in [dti_naive, dti_tz]:
        self.assertEqual(dti.year[0], 1998)
        self.assertEqual(dti.month[0], 1)
        self.assertEqual(dti.day[0], 1)
        self.assertEqual(dti.hour[0], 0)
        self.assertEqual(dti.minute[0], 0)
        self.assertEqual(dti.second[0], 0)
        self.assertEqual(dti.microsecond[0], 0)
        self.assertEqual(dti.dayofweek[0], 3)

        self.assertEqual(dti.dayofyear[0], 1)
        self.assertEqual(dti.dayofyear[120], 121)

        self.assertEqual(dti.weekofyear[0], 1)
        self.assertEqual(dti.weekofyear[120], 18)

        self.assertEqual(dti.quarter[0], 1)
        self.assertEqual(dti.quarter[120], 2)

        self.assertEqual(dti.days_in_month[0], 31)
        self.assertEqual(dti.days_in_month[90], 30)

        self.assertEqual(dti.is_month_start[0], True)
        self.assertEqual(dti.is_month_start[1], False)
        self.assertEqual(dti.is_month_start[31], True)
        self.assertEqual(dti.is_quarter_start[0], True)
        self.assertEqual(dti.is_quarter_start[90], True)
        self.assertEqual(dti.is_year_start[0], True)
        self.assertEqual(dti.is_year_start[364], False)
        self.assertEqual(dti.is_month_end[0], False)
        self.assertEqual(dti.is_month_end[30], True)
        self.assertEqual(dti.is_month_end[31], False)
        self.assertEqual(dti.is_month_end[364], True)
        self.assertEqual(dti.is_quarter_end[0], False)
        self.assertEqual(dti.is_quarter_end[30], False)
        self.assertEqual(dti.is_quarter_end[89], True)
        self.assertEqual(dti.is_quarter_end[364], True)
        self.assertEqual(dti.is_year_end[0], False)
        self.assertEqual(dti.is_year_end[364], True)

        # GH 11128
        self.assertEqual(dti.weekday_name[4], u'Monday')
        self.assertEqual(dti.weekday_name[5], u'Tuesday')
        self.assertEqual(dti.weekday_name[6], u'Wednesday')
        self.assertEqual(dti.weekday_name[7], u'Thursday')
        self.assertEqual(dti.weekday_name[8], u'Friday')
        self.assertEqual(dti.weekday_name[9], u'Saturday')
        self.assertEqual(dti.weekday_name[10], u'Sunday')

        self.assertEqual(Timestamp('2016-04-04').weekday_name, u'Monday')
        self.assertEqual(Timestamp('2016-04-05').weekday_name, u'Tuesday')
        self.assertEqual(Timestamp('2016-04-06').weekday_name, u'Wednesday')
        self.assertEqual(Timestamp('2016-04-07').weekday_name, u'Thursday')
        self.assertEqual(Timestamp('2016-04-08').weekday_name, u'Friday')
        self.assertEqual(Timestamp('2016-04-09').weekday_name, u'Saturday')
        self.assertEqual(Timestamp('2016-04-10').weekday_name, u'Sunday')

        self.assertEqual(len(dti.year), 365)
        self.assertEqual(len(dti.month), 365)
        self.assertEqual(len(dti.day), 365)
        self.assertEqual(len(dti.hour), 365)
        self.assertEqual(len(dti.minute), 365)
        self.assertEqual(len(dti.second), 365)
        self.assertEqual(len(dti.microsecond), 365)
        self.assertEqual(len(dti.dayofweek), 365)
        self.assertEqual(len(dti.dayofyear), 365)
        self.assertEqual(len(dti.weekofyear), 365)
        self.assertEqual(len(dti.quarter), 365)
        self.assertEqual(len(dti.is_month_start), 365)
        self.assertEqual(len(dti.is_month_end), 365)
        self.assertEqual(len(dti.is_quarter_start), 365)
        self.assertEqual(len(dti.is_quarter_end), 365)
        self.assertEqual(len(dti.is_year_start), 365)
        self.assertEqual(len(dti.is_year_end), 365)
        self.assertEqual(len(dti.weekday_name), 365)

        dti.name = 'name'

        # non boolean accessors -> return Index
        for accessor in ['year', 'month', 'day', 'hour', 'minute', 'second',
                         'microsecond', 'nanosecond', 'dayofweek', 'dayofyear',
                         'weekofyear', 'quarter', 'weekday_name']:
            res = getattr(dti, accessor)
            assert len(res) == 365
            assert isinstance(res, Index)
            assert res.name == 'name'

        # boolean accessors -> return array
        for accessor in ['is_month_start', 'is_month_end',
                         'is_quarter_start', 'is_quarter_end',
                         'is_year_start', 'is_year_end']:
            res = getattr(dti, accessor)
            assert len(res) == 365
            assert isinstance(res, np.ndarray)

        # test boolean indexing
        res = dti[dti.is_quarter_start]
        exp = dti[[0, 90, 181, 273]]
        tm.assert_index_equal(res, exp)
        res = dti[dti.is_leap_year]
        exp = DatetimeIndex([], freq='D', tz=dti.tz, name='name')
        tm.assert_index_equal(res, exp)

    dti = DatetimeIndex(freq='BQ-FEB', start=datetime(1998, 1, 1),
                        periods=4)

    self.assertEqual(sum(dti.is_quarter_start), 0)
    self.assertEqual(sum(dti.is_quarter_end), 4)
    self.assertEqual(sum(dti.is_year_start), 0)
    self.assertEqual(sum(dti.is_year_end), 1)

    # Ensure is_start/end accessors throw ValueError for CustomBusinessDay,
    # CBD requires np >= 1.7
    bday_egypt = offsets.CustomBusinessDay(weekmask='Sun Mon Tue Wed Thu')
    dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt)
    self.assertRaises(ValueError, lambda: dti.is_month_start)

    dti = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'])

    self.assertEqual(dti.is_month_start[0], 1)

    tests = [(Timestamp('2013-06-01', freq='M').is_month_start, 1),
             (Timestamp('2013-06-01', freq='BM').is_month_start, 0),
             (Timestamp('2013-06-03', freq='M').is_month_start, 0),
             (Timestamp('2013-06-03', freq='BM').is_month_start, 1),
             (Timestamp('2013-02-28', freq='Q-FEB').is_month_end, 1),
             (Timestamp('2013-02-28', freq='Q-FEB').is_quarter_end, 1),
             (Timestamp('2013-02-28', freq='Q-FEB').is_year_end, 1),
             (Timestamp('2013-03-01', freq='Q-FEB').is_month_start, 1),
             (Timestamp('2013-03-01', freq='Q-FEB').is_quarter_start, 1),
             (Timestamp('2013-03-01', freq='Q-FEB').is_year_start, 1),
             (Timestamp('2013-03-31', freq='QS-FEB').is_month_end, 1),
             (Timestamp('2013-03-31', freq='QS-FEB').is_quarter_end, 0),
             (Timestamp('2013-03-31', freq='QS-FEB').is_year_end, 0),
             (Timestamp('2013-02-01', freq='QS-FEB').is_month_start, 1),
             (Timestamp('2013-02-01', freq='QS-FEB').is_quarter_start, 1),
             (Timestamp('2013-02-01', freq='QS-FEB').is_year_start, 1),
             (Timestamp('2013-06-30', freq='BQ').is_month_end, 0),
             (Timestamp('2013-06-30', freq='BQ').is_quarter_end, 0),
             (Timestamp('2013-06-30', freq='BQ').is_year_end, 0),
             (Timestamp('2013-06-28', freq='BQ').is_month_end, 1),
             (Timestamp('2013-06-28', freq='BQ').is_quarter_end, 1),
             (Timestamp('2013-06-28', freq='BQ').is_year_end, 0),
             (Timestamp('2013-06-30', freq='BQS-APR').is_month_end, 0),
             (Timestamp('2013-06-30', freq='BQS-APR').is_quarter_end, 0),
             (Timestamp('2013-06-30', freq='BQS-APR').is_year_end, 0),
             (Timestamp('2013-06-28', freq='BQS-APR').is_month_end, 1),
             (Timestamp('2013-06-28', freq='BQS-APR').is_quarter_end, 1),
             (Timestamp('2013-03-29', freq='BQS-APR').is_year_end, 1),
             (Timestamp('2013-11-01', freq='AS-NOV').is_year_start, 1),
             (Timestamp('2013-10-31', freq='AS-NOV').is_year_end, 1),
             (Timestamp('2012-02-01').days_in_month, 29),
             (Timestamp('2013-02-01').days_in_month, 28)]

    for ts, value in tests:
        self.assertEqual(ts, value)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
from scipy.stats.stats import pearsonr
from global_functions import read_datatxt

plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plt.rcParams['xtick.labelsize'] = 13
plt.rcParams['ytick.labelsize'] = 13
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['axes.titlesize'] = 16

# Rn:
mdnRnA = np.loadtxt('../../../mdnRnA.txt', delimiter=',')
neuron = [64, 32]
startday = pd.datetime(2013, 7, 1)

## Plot:
CNN_loss = np.loadtxt('./CNN_Loss_Rn_{}_{}.txt'.format(neuron[0], neuron[1]),
                      delimiter=',')
train_loss = CNN_loss[0]
test_loss = CNN_loss[1]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))
plt.plot(train_loss)
plt.plot(test_loss)
plt.ylabel('loss', fontsize=14)
plt.xlabel('epoch', fontsize=14)
plt.legend(['train', 'test'], loc='upper right')
plt.tight_layout()
plt.savefig('./CNN_Loss_Rn_{}_{}.pdf'.format(neuron[0], neuron[1]))
def el_to_dt(cell):
    # decode an integer of the form (year - 1900)*10000 + month*100 + day
    yr = int(cell / 10000) + 1900
    mth = cell - int(cell / 10000) * 10000
    mth = int(mth / 100)
    day = cell - int(cell / 100) * 100
    return pd.datetime(yr, mth, day).date()
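# Quick check of the decoding above (the input values are illustrative):
assert el_to_dt(1170704) == pd.datetime(2017, 7, 4).date()
assert el_to_dt(990115) == pd.datetime(1999, 1, 15).date()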