def get_time_range_from_files(files):
    """Get time range from input files."""
    times = [re.findall('\d\d\d\d\d\d+', fname) for fname in files]
    t_1 = [t1 for t1, t2 in times]
    t_2 = [t2 for t1, t2 in times]
    t_min, t_max = min(t_1), max(t_2)
    #times = np.array(times, 'i4').reshape(len(times), 2)
    #t_min, t_max = str(times.min()), str(times.max())
    y1, m1 = int(t_min[:4]), int(t_min[4:6])
    y2, m2 = int(t_max[:4]), int(t_max[4:6])
    # seasonal
    if len(t_min) > 6 and len(t_max) > 6:
        y1, m1 = get_season(y1, m1, get_month=1)
        y2, m2 = get_season(y2, m2, get_month=1)
        start = '%d/%d/%d' % (y1, m1, 15)
        end = '%d/%d/%d' % (y2, m2, 15)
        return pn.DateRange(start, end,
                            offset=pn.datetools.DateOffset(months=3))
    # monthly
    else:
        print t_min
        start = '%d/%d/%d' % (y1, m1, 15)
        end = '%d/%d/%d' % (y2, m2, 15)
        return pn.DateRange(start, end,
                            offset=pn.datetools.DateOffset(months=1))
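A minimal usage sketch for the function above; the file names are hypothetical, and only the two embedded 6-digit-or-longer YYYYMM... timestamps per name matter:

# hypothetical monthly files named <var>_<YYYYMM>_<YYYYMM>.h5
files = ['h_199301_199312.h5', 'h_199401_199412.h5']
rng = get_time_range_from_files(files)  # monthly DateRange, 1993-01-15 through 1994-12-15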
def analysis(self, year, month, force=False):
    if year and month:
        self.range = pandas.DateRange(datetime.datetime(year, month, 1, 12),
                                      periods=1,
                                      offset=pandas.DateOffset(months=1))
    else:
        self.range = pandas.DateRange(self.start_date,
                                      end=self.end_date,
                                      offset=pandas.DateOffset(months=1))
    mr_dict = self.map_reduce(force=force)
    return self.cohort_analysis(mr_dict)
def signups(db, end=None, period='hours', start=None):
    if not end:
        end = now()
    end = datetime.datetime.fromtimestamp(end)
    end = end.replace(minute=0, second=0, microsecond=0)
    if not start:
        start = end - pandas.DateOffset(hours=48)
    else:
        start = datetime.datetime.fromtimestamp(start)
        start = start.replace(minute=0, second=0, microsecond=0)
    if period != 'hours':  # was "not period is 'hours'"; strings need equality, not identity
        start = start.replace(hour=8)
    hourly = pandas.DateRange(start=start, end=end,
                              offset=pandas.DateOffset(**{period: 1}))
    start_epoch = datetime_to_int(hourly[0])
    end_epoch = datetime_to_int(hourly[-1] + hourly.offset)
    spec = {'created': {'$gt': start_epoch, '$lt': end_epoch}}

    def group_data(cursor):
        contact_times = sorted([datetime.datetime.utcfromtimestamp(c['created'])
                                for c in cursor])
        data = pandas.Series(1, contact_times)
        data = pandas.Series(data.groupby(hourly.asof).sum())
        return data.reindex(index=hourly, fill_value=0)

    cursor = db.contact_log.find(spec, {'created': True})
    contacts = group_data(cursor)

    spec.update({'user_created': {'$exists': True}})
    cursor = db.contact_log.find(spec, {'created': True})
    created_contacts = group_data(cursor)

    return {
        'index': [time.mktime(x.timetuple()) for x in hourly.tolist()],
        'total': contacts.values.tolist(),
        'active': created_contacts.values.tolist(),
        'ratio': (created_contacts / contacts).values.tolist(),
    }
def load_time_series(symbol, start_date=None, end_date=None, downsample_days=1):
    log.info("loading %s for %s to %s" % (symbol, start_date, end_date))
    filename = "%s.csv" % symbol
    if not os.path.exists(filename):
        fetch.fetch_data(symbol)
    data = pandas.read_csv(filename, parse_dates=True, index_col=0)
    data = data.drop(["Open", "High", "Low", "Close", "Volume"], axis=1)
    data = data.rename(columns={"Adj Close": symbol})
    data = data.sort()
    if data.index[0] > start_date:
        log.warning("no data for %s before %s" % (symbol, data.index[0]))
        return None
    data = data.truncate(before=start_date, after=end_date)
    log.info("%d rows after truncating" % len(data))

    # downsample if necessary
    if downsample_days > 1:
        drange = pandas.DateRange(start_date, end_date,
                                  offset=downsample_days * datetools.day)
        grouped = data.groupby(drange.asof)
        means = grouped.mean()
        log.info("%d rows after downsampling" % len(means))
        return means
    else:
        return data
def fetch_price_frame(self, startdate, enddate):
    dts = ps.DateRange(start=startdate, end=enddate, offset=bday)
    dividends = []
    prices = []
    for dt in dts:
        irs = InterestRate.objects.filter(date__lte=dt,
                                          investment=self).order_by('-date')
        if len(irs) == 0:
            return None
        else:
            ir = irs[0]
        prices.append(Decimal(1.0))
        dividends.append(ir.annualrate / 365)
    data = {
        'price': prices,
        'dividend': dividends,
    }
    df = ps.DataFrame(data, index=dts)
    return df
def table_33():
    path = os.path.join(data_path, 'Table3.3.data.txt')
    sep = '\s+'
    lines = [re.split(sep, l.strip()) for l in open(path)]
    y_data = []
    f_data = []
    saw_f = False
    for line in lines:
        if line[0] == 'Y':
            continue
        elif line[0] == 'F':
            saw_f = True
            continue
        # drop year
        if saw_f:
            f_data.extend(line[1:])
        else:
            y_data.extend(line[1:])
    y_data = np.array(y_data, dtype=float)
    f_data = np.array(f_data, dtype=float)
    dates = pn.DateRange(datetime(1975, 1, 1), periods=len(y_data),
                         timeRule='Q@MAR')
    Y = pn.Series(y_data, index=dates)
    F = pn.Series(f_data, index=dates)
    return Y, F
def download_hist(sleept=10):
    """ download historical data from 1999/01/31 to 2014/05/31 """
    for i in pd.DateRange(dt.datetime(1999, 1, 1), dt.datetime(2014, 6, 1),
                          offset=pd.datetools.MonthEnd()):
        download(i)
        time.sleep(sleept)
def sse_hist(start=dt.datetime(2006, 8, 7), end=dt.datetime.today()):
    fail_list = []
    for date in pd.DateRange(start, end):
        print date
        fail_list.append(sse_lhb(date))
        time.sleep(10)
    print [i for i in fail_list if i]
def _trade_dates(dt_start, dt_end, s_period):
    '''
    @summary: Generate dates on which we need to trade
    @param dt_start: Start date
    @param dt_end: End date
    @param s_period: Rebalancing period, an old pandas timeRule string
                     (a leading 'B' means biweekly, e.g. 'BW@FRI')
    '''
    ldt_timestamps = getNYSEdays(dt_start, dt_end, dt.timedelta(hours=16))

    # Use pandas reindex method instead
    # Note, dates are index as well as values, we select based on index
    # but return values since it is a numpy array of datetimes instead of
    # pandas specific.
    ts_dates = pd.TimeSeries(index=ldt_timestamps, data=ldt_timestamps)

    # These are the dates we want
    if s_period[:2] == 'BW':
        # special case for biweekly
        dr_range = pd.DateRange(dt_start, dt_end, timeRule=s_period[1:])
        dr_range = np.asarray(dr_range)
        li_even = np.array(range(len(dr_range)))
        dr_range = dr_range[li_even[li_even % 2 == 0]]
    else:
        dr_range = pd.DateRange(dt_start, dt_end, timeRule=s_period)
        dr_range = np.asarray(dr_range)

    # Warning, we MUST copy the date range; if we modify it, it will be returned
    # in its modified form the next time we use it.
    dr_range = np.copy(dr_range)
    dr_range += pd.DateOffset(hours=16)
    ts_dates = ts_dates.reindex(dr_range, method='bfill')
    ldt_dates = ts_dates[ts_dates.notnull()].values

    # Make unique
    sdt_unique = set()
    ldt_dates = [x for x in ldt_dates
                 if x not in sdt_unique and not sdt_unique.add(x)]

    return ldt_dates
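A hypothetical call of the helper above, assuming QSTK-style getNYSEdays and the old pandas timeRule strings are available; the period string is illustrative:

# weekly rebalance dates (Fridays at 16:00) over 2010; 'BW@FRI' would give biweekly
ldt_rebal = _trade_dates(dt.datetime(2010, 1, 1), dt.datetime(2010, 12, 31), 'W@FRI')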
def download_hist(sleept=10):
    """ download historical data from 2014-06-01 """
    for i in pd.DateRange(dt.datetime(2014, 6, 1), dt.datetime.today(),
                          offset=pd.datetools.MonthEnd()):
        downloader_new(i)
        time.sleep(sleept)
def discrete_boxcar_average(self, seconds=1):
    """Computes a discrete boxcar average for the DataFrame"""
    date_range = pandas.DateRange(self.data.index[0], self.data.index[-1],
                                  offset=pandas.datetools.Second(seconds))
    grouped = self.data.groupby(date_range.asof)
    subsampled = grouped.mean()
    return LightCurve(subsampled, self.header.copy())
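The groupby(DateRange.asof) downsampling idiom used above recurs in several of these snippets; a standalone sketch with a made-up one-value-per-second series (old pandas API):

import numpy as np
import pandas

idx = pandas.DateRange('1/1/2011', periods=10, offset=pandas.datetools.Second(1))
ts = pandas.Series(np.arange(10.0), index=idx)

# bucket each timestamp to the most recent 5-second boundary and average
buckets = pandas.DateRange(idx[0], idx[-1], offset=pandas.datetools.Second(5))
downsampled = ts.groupby(buckets.asof).mean()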
def collect_year(self, year=None, station_name=None, exact_station=False,
                 location_WMO=None, location_WBAN=None, country=None,
                 state=None, internet_connected=True):
    """ Process a request for data for a given year, optionally at a given
    location.

    Inputs:
    - year, int. If no year is passed, choose the current one.
    - station_name, str. (Part of) Name of the station to collect data at.
      The station names are searched for in the ish-history txt file stored
      in self.location_db.
    - exact_station, bool. If false, all station names are searched and the
      ones containing the string station_name are selected.
    - location WMO code and/or WBAN code, int, int. If no location is
      selected, collect the yearly data for all locations.

    Output:
    - pandas data structure: 2D (DataFrame) if only one location is
      requested, 3D (Panel) if multiple locations are requested.
    """
    if year is None:
        year = datetime.datetime.today().year
        warnings.warn("No year was provided: using the current one (%s)" % year)

    no_location = (location_WMO is None and location_WBAN is None and
                   station_name is None and country is None and state is None)
    if no_location:
        # Requested all data for the year at all locations. Returns a panel
        # if it can fit in memory, and None if not. In the latter case, the
        # data files are still stored locally.
        return collect_year(year)
    else:
        filtered = search_station(self.location_db, self.location_dict,
                                  station_name, exact_station, location_WMO,
                                  location_WBAN, country, state)
        if len(filtered) == 1:
            result = collect_year_at_loc(year,
                                         location_WMO=filtered['USAF'][0],
                                         location_WBAN=filtered['WBAN'][0],
                                         internet_connected=internet_connected)
        else:
            data = {}
            for layer in filtered:
                df = collect_year_at_loc(year, layer['USAF'], layer['WBAN'],
                                         internet_connected=internet_connected)
                if df is None:
                    continue
                # reindex over the entire year in case there are missing values
                df = df.reindex(pandas.DateRange(start='1/1/%s' % year,
                                                 end='31/12/%s' % year,
                                                 offset=pandas.datetools.day))
                key = "%s-%s" % (layer['USAF'], layer['WBAN'])
                data[key] = df
            result = pandas.Panel(data)
    return result
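The reindex-over-a-full-calendar-year step used in the method above, sketched standalone with a toy series (old DateRange API; the data are made up):

import numpy as np
import pandas

year = 2007
days = pandas.DateRange(start='1/1/%s' % year, end='1/10/%s' % year,
                        offset=pandas.datetools.day)
partial = pandas.Series(np.arange(len(days), dtype=float), index=days)

# reindex over the entire year so missing days show up as NaN
full_year = pandas.DateRange(start='1/1/%s' % year, end='12/31/%s' % year,
                             offset=pandas.datetools.day)
partial = partial.reindex(full_year)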
def dataframe(self):
    bs = self.budgets
    ts = self.timeseries
    if len(ts) == 0:
        return None
    for c in self.child.all():
        cts = c.timeseries
        if cts:
            ts = ts.combine(cts, np.sum, 0)
    if bs:
        bts = bs[0].timeseries
        if len(bs) > 1:
            for b in range(1, len(bs)):
                # was "b.timeseries", but b is an index into bs
                bts = bts.combine(bs[b].timeseries, np.sum, 0)
    else:
        bts = ps.TimeSeries([0], index=[ts.index[0]])
    startdate = min(ts.index[0], bts.index[0])
    enddate = max(ts.index[-1], bts.index[-1])
    dates = ps.DateRange(startdate, enddate, offset=ps.DateOffset(days=1))
    bts = bts.reindex(dates)
    ts = ts.reindex(dates)
    df = ps.DataFrame({'actual': ts, 'budget': bts})
    df = df.fillna(0)
    df['actual'] = df['actual'].apply(Decimal)
    df['budget'] = df['budget'].apply(Decimal)
    df['vsbudget'] = (df['actual'] - df['budget']).apply(Decimal)
    types = []
    accounts = []
    depth = []
    for dt in df.index:
        types.append(self.account_type)
        accounts.append(self.guid)
        depth.append(self.depth)
    edf = ps.DataFrame({
        'type': types,
        'account': accounts,
        'depth': depth,
    }, index=df.index)
    df['type'] = edf['type']
    df['account'] = edf['account']
    df['depth'] = edf['depth']
    return df
def _generateCashflow(self):
    monthly_offset = 12 / self.frequency
    #offset = int(365.2425/self.frequency)
    dr = pd.DateRange(self.startdate, self.maturitydate,
                      offset=pdt.DateOffset(months=monthly_offset))  # bday * offset
    cf = Cashflow(cf_times=dr,
                  cf_amounts=self.couponrate * 100 / self.frequency)
    cf.cf[0] = -self.price
    cf.cf[cf.cf.count() - 1] += 100.0
    return cf
def contacts_per_day(db, end=None):
    # was "end=now()", which is evaluated only once at import time
    if end is None:
        end = now()
    end = datetime.datetime.fromtimestamp(end)
    end = end.replace(hour=12, minute=0, second=0, microsecond=0)
    hourly = pandas.DateRange(end=end, offset=pandas.DateOffset(hours=24),
                              periods=120)
    contacts = db.contact_log.find(
        {'created': {'$gt': time.mktime(hourly[0].timetuple())}},
        {'created': True})
    contact_times = sorted([datetime.datetime.utcfromtimestamp(c['created'])
                            for c in contacts])
    data = pandas.Series(1, contact_times)
    data = pandas.Series(data.groupby(hourly.asof).sum())
    return {
        'times': [time.mktime(x.timetuple()) for x in data.index.tolist()],
        'values': data.values.tolist(),
    }
def parse_lutkepohl_data(path):  # pragma: no cover
    """
    Parse data files from Lutkepohl (2005) book

    Source for data files: www.jmulti.de
    """
    from collections import deque
    from datetime import datetime
    import re

    import pandas
    import pandas.core.datetools as dt

    regex = re.compile('<(.*) (\w)([\d]+)>.*')
    lines = deque(open(path))

    to_skip = 0
    while '*/' not in lines.popleft():
        to_skip += 1

    while True:
        to_skip += 1
        line = lines.popleft()
        m = regex.match(line)
        if m:
            year, freq, start_point = m.groups()
            break

    data = np.genfromtxt(path, names=True, skip_header=to_skip + 1)
    n = len(data)

    # generate the corresponding date range (using pandas for now)
    start_point = int(start_point)
    year = int(year)

    offsets = {
        'Q': dt.BQuarterEnd(),
        'M': dt.BMonthEnd(),
        'A': dt.BYearEnd(),
    }

    # create an instance
    offset = offsets[freq]
    inc = offset * (start_point - 1)
    start_date = offset.rollforward(datetime(year, 1, 1)) + inc

    date_range = pandas.DateRange(start_date, offset=offset, periods=n)

    return data, date_range
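A hypothetical call, assuming a local copy of one of the www.jmulti.de data files (the file name below is illustrative):

# data is the structured array from np.genfromtxt; dates is the matching
# business-frequency DateRange built from the <... Q/M/A ...> header line
data, dates = parse_lutkepohl_data('e1.dat')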
def _execute(self, period):
    def actions_per_user_per_day():
        map1 = """
            function() {
                date = new Date((this.created - 12*3600) * 1000);
                day = Date.UTC(date.getFullYear(), date.getMonth(), date.getDate(), 12);
                emit({name: this.user_name, date: day/1000}, 1);
            }"""
        reduce = """
            function(key, values) {
                var total = 0;
                for (var i = 0; i < values.length; i++) {
                    total += values[i];
                }
                return total;
            }"""
        mr1_name = 'mr.actions_per_user_per_day'
        mr1 = self.db.mdb[mr1_name]
        latest = mr1.find_one(sort=[('_id.date', -1)])['_id']['date']
        # The following performs an incremental map reduce, but depends on
        # mongodb version >= 1.8
        return self.db.ActionLog._col.map_reduce(
            map1, reduce, mr1_name, merge_output=True,
            query={'created': {'$gt': latest - 24 * 3600}})

    mr_col = actions_per_user_per_day()
    mr_col.ensure_index('_id.date')

    offset = pandas.DateOffset(days=period)
    start = newhive.utils.time_u(
        mr_col.find_one(sort=[('_id.date', 1)])['_id']['date'])
    index = pandas.DateRange(start=start + offset,
                             end=datetime.datetime.now(),
                             offset=pandas.DateOffset(days=1))

    def users_active_on(date):
        cursor = mr_col.find(
            {'_id.date': dates_to_spec(date - offset, date)})
        return len(cursor.distinct('_id.name'))

    data = pandas.DataFrame(
        index=index,
        data={'Active{}'.format(period): index.map(users_active_on)})
    return data
def downloadBTCDateRange(start, end):
    for d in pd.DateRange(start, end):
        addDateToPayload(d)
        req2 = s.post(BTCurl, headers=headers, data=payload)
        res = req2.headers.get('content-disposition')
        if pd.isnull(res):
            print "no data on {}".format(d)
        elif 'filename' in res:
            filename = res.split('=')[1]
            print filename
            filepath = '{}/BTC-OpenPositions/{}'.format(outdir, filename)
            with open(filepath, "wb") as f:
                f.write(req2.content)
        # pause 5 secs
        pause()
def foo():
    path = os.path.join(data_path, 'Table11.1.data.txt')
    sep = '\s+'
    lines = [re.split(sep, l.strip()) for l in open(path)]
    datad = {}
    for start in [0]:
        name = lines[start][0]
        time_rule = lines[start + 1][0]
        start_date = lines[start + 2][0]
        data = np.concatenate(lines[start + 3:start + 9]).astype(float)
        dates = pn.DateRange(start_date, periods=len(data), timeRule=time_rule)
        datad[name] = pn.Series(data, index=dates)
    return pn.DataFrame(datad)
def parse_table_22():
    path = os.path.join(data_path, 'Table2.2.data.txt')
    sep = '\s+'
    lines = [re.split(sep, l.strip()) for l in open(path)]
    data = []
    for line in lines:
        # drop year
        data.extend(line[1:])
    data = np.array(data, dtype=float) / 100
    dates = pn.DateRange(datetime(1975, 1, 1), periods=len(data),
                         timeRule='EOM')
    return pn.Series(data, index=dates)
def budgetpanel(self, startdate, enddate):
    analysis_dates = ps.DateRange(startdate, enddate,
                                  offset=ps.DateOffset(days=1))
    data = {}
    for a in self:
        if type(a) != str:
            df = a.dataframe
            if df is not None:
                df = df.reindex(analysis_dates)
                data[a] = df
    p = ps.Panel(data, major_axis=analysis_dates)
    return p
def loadfrets(model, start_date, stop_date):
    '''
    load axioma factor returns data over a given window

    Parameters:
        model : str
            name of axioma risk model
        start_date : datetime
            start date of factor returns
        stop_date : datetime
            stop date of factor returns

    returns DataFrame
    '''
    tmp = pandas.DataFrame()
    for dt in pandas.DateRange(start_date, stop_date):
        _tmp = loadfret(model, dt)
        tmp = tmp.append(_tmp, ignore_index=True)
    return tmp
def backfill_b_alphas2(startdate=datetime.datetime(2005, 1, 1),
                       enddate=datetime.datetime.today() - pandas.datetools.day,
                       ncpus=8):
    """
    Backfill the bucket signals defined in the production env, submitting one
    job per day over the date range.
    """
    print "going to generate production def. bucket alphas"
    job_server = pp.Server(ncpus)
    jobs = []
    for date in pandas.DateRange(startdate, enddate, offset=pandas.datetools.day):
        jobs.append(
            job_server.submit(backfill_b_1d, (date, ), (), ('pandas', 'datetime')))
    for job in jobs:
        job()
    job_server.print_stats()
    job_server.destroy()
def create_full_record(p_series_list):
    '''Tiles records together: creates a daily record over the minimum to
    maximum dates in the series list.
    '''
    #--find the min and max dates
    min_date = datetime(year=3012, month=1, day=1)
    max_date = datetime(year=1512, month=1, day=1)
    for p in p_series_list:
        if p.index.min() < min_date:
            min_date = p.index[0]
        if p.index.max() > max_date:
            max_date = p.index[-1]
    #print min_date,max_date

    #--create new pandas date range inclusive of the whole record
    d_range = pandas.DateRange(start=min_date, end=max_date,
                               offset=pandas.core.datetools.day)
    full_series = pandas.TimeSeries(np.ones(len(d_range)) * np.nan, d_range)
    #print d_range
    for dt, val in full_series.iteritems():
        #--try to find an entry in one of the series for this day
        v = np.nan
        for p in p_series_list:
            #print p.head()
            try:
                # was "p[dt] != np.nan", which is always True
                if not np.isnan(p[dt]):
                    v = p[dt]
            except:
                pass
        if not np.isnan(v):
            full_series[dt] = v
            #break
    return full_series
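A toy usage sketch of the function above; the two short overlapping series are made up:

from datetime import datetime
import pandas

a = pandas.TimeSeries([1.0, 2.0], index=[datetime(2000, 1, 1), datetime(2000, 1, 2)])
b = pandas.TimeSeries([3.0], index=[datetime(2000, 1, 5)])
full = create_full_record([a, b])  # daily series 2000-01-01..2000-01-05, NaN where no entry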
def mdf_evalto(self, parameter_s=""):
    """
    Advances the current context to the end date and returns a pandas
    dataframe of nodes evaluated on each timestep.

    %mdf_evalto <end_date> [nodes...]

    eg: %mdf_evalto 2020-01-01 <my node 1> <my node 2>
    """
    args = tokenize(parameter_s)
    cur_ctx = _get_current_context()
    root_ctx = cur_ctx.get_parent() or cur_ctx

    end_date, nodes = args[0], args[1:]
    end_date = _parse_datetime(end_date, self.shell.user_global_ns,
                               self.shell.user_ns)
    nodes = map(lambda x: eval(x, self.shell.user_global_ns, self.shell.user_ns),
                nodes)

    df_ctx = root_ctx
    if len(nodes) > 0 and isinstance(nodes[-1], (dict, list, tuple)):
        shift_sets = _get_shift_sets(args[-1], nodes.pop())
        assert len(shift_sets) <= 1, "Only one shift set allowed for %mdf_evalto"
        if shift_sets:
            unused, shift_set = shift_sets[0]
            df_ctx = df_ctx.shift(shift_set=shift_set)

    df_builder = DataFrameBuilder(nodes, filter=True)
    date_range = pd.DateRange(cur_ctx.get_date(), end_date,
                              offset=self.__timestep)
    for dt in date_range:
        root_ctx.set_date(dt)
        df_builder(dt, df_ctx)

    return df_builder.get_dataframe(df_ctx)
def downloadBTCDateRange(start, end):
    fns = []
    for d in pd.DateRange(start, end):
        addDateToPayload(d)
        req2 = s.post(BTCurl, headers=headers, data=payload)
        res = req2.headers.get('content-disposition')
        if pd.isnull(res):
            print "no data on {}".format(d)
        elif 'filename' in res:
            filename = res.split('=')[1]
            print filename
            filepath = '{}/BTC-OpenPositions/{}'.format(outdir, filename)
            foldern = outdir + '/BTC-OpenPositions'
            if not os.path.exists(foldern):
                os.makedirs(foldern)
            print filepath
            with open(filepath, 'wb') as f:
                f.write(req2.content)
            fns.append(filepath)
        # pause 5 secs
        time.sleep(5)
    return fns
def main(startdate=None, enddate=None, sleeptime=0):
    """ """
    if startdate is None:
        startdate = dt.datetime.today()
    if enddate is None:
        enddate = dt.datetime.today()
    for date in pd.DateRange(startdate, enddate, offset=pd.datetools.day):
        print date.strftime('%Y-%m-%d')
        fn = download_report(date)
        df = parser(fn)
        sql_del = """delete from %(table)s where datadate='%(date)s'""" % {
            'table': TABLE,
            'date': date.strftime('%Y-%m-%d'),
        }
        dbo.cursor.execute(sql_del)
        dbo.commit()
        print "len(df) = %s" % len(df)
        print "start to upload to table %s" % TABLE
        sql_io.write_frame(df, TABLE, if_exists='append', bulk='off')
        time.sleep(sleeptime)
def run(start_date, end_date=None, forceall=False):
    if end_date is None:
        end_date = start_date
    bd_list = pd.DateRange(start_date, end_date, offset=pd.datetools.bday)
    d_str_l = [re.search('\d+', i).group() for i in os.listdir(output_dir)]
    for bd in bd_list:
        print bd
        d_str = bd.strftime('%Y%m%d')
        if (not forceall) and (d_str in d_str_l):
            continue
        c1 = requests.get(url_base % d_str).content
        # skip if the page body contains the UTF-8 bytes for the Chinese
        # phrase meaning "the location has been changed"
        if '\xe4\xbd\x8d\xe7\xbd\xae\xe5\xb7\xb2\xe6\x9b\xb4\xe6\x94\xb9' in c1:
            continue
        with open(output_dir + '/DTOP_O_%s.zip' % d_str, 'wb') as f:
            f.write(c1)
        time.sleep(60)
def _magic_dataframe(self, parameter_s, widepanel=False, single_df=True):
    """Implementation for magic_dataframe and magic_widepanel"""
    # the first two arguments are dates, and after that it's a list of nodes
    # with some optional keyword args, ie
    # %mdf_df <start> <end> node, node, node, shifts=[{x:1}, {x:2}]
    args = arg_names = tokenize(parameter_s)
    args = [_try_eval(x, self.shell.user_global_ns, self.shell.user_ns)
            for x in args]
    args = list(zip(arg_names, args))

    start = None
    if len(args) > 0:
        arg_name, arg = args.pop(0)
        start = _parse_datetime(arg_name, self.shell.user_global_ns,
                                self.shell.user_ns)

    end = None
    if len(args) > 0:
        arg_name, arg = args.pop(0)
        end = _parse_datetime(arg_name, self.shell.user_global_ns,
                              self.shell.user_ns)

    # the final argument can be the number of processes to use
    num_processes = 0
    if len(args) > 0:
        arg_name, arg = args[-1]
        if isinstance(arg, basestring) and arg.startswith("||"):
            arg_name, arg = args.pop()
            num_processes = int(arg[2:])

    # the next to last parameter may be a shift set or list of shift sets
    has_shifts = False
    shift_sets = [{}]  # always have at least one empty shift set
    shift_names = ["_0"]

    arg_name, arg = args[-1] if len(args) > 0 else (None, None)
    if not isinstance(arg, MDFNode):
        arg_name, arg = args.pop()
        named_shift_sets = _get_shift_sets(arg_name, arg)
        if named_shift_sets:
            shift_names, shift_sets = zip(*named_shift_sets)
            has_shifts = True

    # any remaining arguments are the nodes
    nodes = []
    node_var_names = []
    for arg_name, node in args:
        assert isinstance(node, MDFNode), "%s is not a node" % arg_name
        nodes.append(node)
        node_var_names.append(arg_name)

    curr_ctx = _get_current_context()
    ctxs = [None] * len(nodes)

    if not nodes:
        # get the selected nodes from the viewer
        if _viewer_imported:
            selected = viewer.get_selected()
            ctxs, nodes = zip(*selected)
            ctxs = list(ctxs)  # zip returns a tuple; we need item assignment below
            for i, (ctx, node) in enumerate(selected):
                assert ctx.is_shift_of(curr_ctx), \
                    "selected node '%s' is not in the current context" % node.name
                # replace any contexts that are simply the current context with
                # None so that shifting works correctly
                if ctx is curr_ctx:
                    ctxs[i] = None

    # if there are shifts then all the contexts have to be None otherwise the
    # shifts won't work correctly. This could be relaxed later if it causes
    # problems, but for now this makes the code simpler.
    if has_shifts:
        assert np.array([x is None for x in ctxs]).all(), \
            "Can't apply shifts when contexts are explicitly specified"

    # list of df_builders, one per node or group of nodes
    callbacks = []
    df_builders = []

    if widepanel or not single_df:
        # build multiple dataframes
        for node, ctx in zip(nodes, ctxs):
            if ctx is None:
                df_builder = DataFrameBuilder([node], filter=True)
            else:
                df_builder = DataFrameBuilder([node], contexts=[ctx], filter=True)
            df_builders.append(df_builder)
    else:
        # build a single dataframe
        if np.array([x is None for x in ctxs]).all():
            df_builder = DataFrameBuilder(nodes, filter=True)
        else:
            df_builder = DataFrameBuilder(nodes, contexts=ctxs, filter=True)
        df_builders.append(df_builder)

    # add all the dataframe builders to the callbacks
    callbacks.extend(df_builders)

    root_ctx = curr_ctx.get_parent() or curr_ctx
    date_range = pd.DateRange(start, end, offset=self.__timestep)

    # Add a progress bar to the callbacks
    callbacks.append(ProgressBar(date_range[0], date_range[-1]))

    shifted_ctxs = run(date_range, callbacks, ctx=root_ctx, shifts=shift_sets,
                       num_processes=num_processes)

    if not has_shifts:
        shifted_ctxs = [root_ctx]

    # when returning a list of results because multiple shifts have been
    # specified, use a named tuple with the items being the names of the shifts
    tuple_ctr = tuple
    if has_shifts:
        # Currying hell yeah
        tuple_ctr = partial(ShiftedResultsTuple, shift_names)

    if widepanel:
        wps = []
        for shift_name, shift_set, shifted_ctx in zip(shift_names, shift_sets,
                                                      shifted_ctxs):
            wp_dict = {}
            for node_var_name, df_builder in zip(node_var_names, df_builders):
                wp_dict[node_var_name] = df_builder.get_dataframe(shifted_ctx)
            wp = pd.WidePanel.from_dict(wp_dict)
            if has_shifts:
                wp = WidePanelWithShiftSet(wp, shift_name, shift_set)
            wps.append(wp)

        if len(wps) == 1:
            return wps[0]
        return tuple_ctr(*wps)

    # build a list of lists of dataframes
    # [[dfs for one shift set], [dfs for next shift set], ...]
    df_lists = []
    for shift_name, shift_set, shifted_ctx in zip(shift_names, shift_sets,
                                                  shifted_ctxs):
        dfs = []
        for df_builder in df_builders:
            df = df_builder.get_dataframe(shifted_ctx)
            if has_shifts:
                df = DataFrameWithShiftSet(df, shift_name, shift_set)
            dfs.append(df)
        df_lists.append(dfs)

    if single_df:
        # flatten into a single list (there should be one dataframe per shift)
        dfs = reduce(operator.add, df_lists, [])
        if len(dfs) == 1:
            return dfs[0]
        return tuple_ctr(*dfs)

    if len(df_lists) == 1:
        return df_lists[0]
    return tuple_ctr(*df_lists)
# mod_tb = tb.lpc(y, 2)
# t_end = timer()
# print str(t_end - t) + " seconds for talkbox.lpc"
# print """For higher lag lengths ours quickly fills up memory and starts
#thrashing the swap. Should we include talkbox C code or Cythonize the
#Levinson recursion algorithm?"""

## Try with a pandas series
import pandas
import scikits.timeseries as ts

d1 = ts.Date(year=1700, freq='A')
#NOTE: have to have yearBegin offset for annual data until parser rewrite
#should this be up to the user, or should it be done in TSM init?
#NOTE: not anymore, it's end of year now
ts_dr = ts.date_array(start_date=d1, length=len(sunspots.endog))
pandas_dr = pandas.DateRange(start=d1.datetime,
                             periods=len(sunspots.endog), timeRule='A@DEC')
#pandas_dr = pandas_dr.shift(-1, pandas.datetools.yearBegin)

dates = np.arange(1700, 1700 + len(sunspots.endog))
dates = ts.date_array(dates, freq='A')
#sunspots = pandas.Series(sunspots.endog, index=dates)

#NOTE: pandas only does business days for dates it looks like
import datetime
dt_dates = np.asarray(lmap(datetime.datetime.fromordinal,
                           ts_dr.toordinal().astype(int)))
sunspots = pandas.Series(sunspots.endog, index=dt_dates)

#NOTE: pandas can't handle pre-1900 dates
mod = AR(sunspots, freq='A')
res = mod.fit(method='mle', maxlag=9)