def load(self, dataset, report_func=None):
    """Load a dataset from its pickle cache file.

    :param dataset: path of the dataset source; nothing is loaded when it is
        empty, does not exist, or when ``options.switch_cache`` is off.
    :param report_func: optional callable invoked as
        ``report_func("load", cache_file, result, self, _accounting)`` after a
        successful load; the module-level ``_accounting`` is cleared afterwards.
    :returns: the reconstructed ``DataSet``, or ``None`` when caching is
        disabled, the dataset is missing, or no cache file exists.

    Exits the process with status ``-1`` when an explicit
    ``options.cache_date`` was requested but its cache file is missing.
    """
    options = self._options
    if not dataset or not options.switch_cache or not exists(dataset):
        return None

    # Get the profile from the timeseries file, if one exists.
    _accounting['cache.load.profile'] = self._get_file_profile(
        dataset, options.profile)

    # Only used in the error message below: requested date, else today.
    cache_date = options.cache_date or datetime.today().isoformat()[:10]

    # The return value was never used; the call itself is kept.
    # NOTE(review): presumably it ensures the cache directory exists - confirm.
    self._make_cachedir()

    cache_file = self.filename(dataset)
    _accounting['cache.load.file'] = cache_file

    if not exists(cache_file):
        if options.cache_date:
            logger.error('Requested date (%s) does not exists', cache_date)
            sys.exit(-1)
        return None

    result = DataSet()
    # SECURITY: pickle executes arbitrary code while loading; the cache file
    # must come from a trusted location.
    # Binary mode + context manager: pickles are byte streams, and the handle
    # must not be left open (the original leaked it).
    with open(cache_file, 'rb') as handle:
        payload = pickle.load(handle)

    for key, value in payload.items():
        if key == "_MISSING":
            result.add_missing(*value)
        elif value[0] == 0:
            # Tag 0: timeseries stored as (0, data, metadata)
            result[key] = Timeseries(data=value[1], metadata=value[2], name=key)
        elif value[0] == 1:
            # Tag 1: numpy array stored as (1, array)
            result[key] = value[1]

    _accounting['cache.load.missing'] = ','.join(result.missing)
    _accounting['cache.load.series'] = ','.join(result.keys())

    # NOTE(review): the original indentation was lost; clear() is assumed to
    # belong to the reporting branch - confirm.
    if report_func:
        report_func("load", cache_file, result, self, _accounting)
        _accounting.clear()

    return result
def __init__(self, name, profile=None, options=Options()):
    """Initialise the provider state.

    :param name: name of this provider, stored as ``self.name``.
    :param profile: connection profile, stored as ``self._profile``.
    :param options: options object, stored as ``self._options``.

    NOTE: the ``options`` default is created once, when this ``def`` is
    evaluated, so every call that omits ``options`` shares that one instance.
    """
    # Value unused; the attribute read is kept from the original.
    _counter = options.counter

    # Identity and configuration
    self._options = options
    self.name = name
    self._profile = profile  # connection profile
    self._retries = 3  # TODO: to get from options

    # Request state
    self.requesting = []  # parsed request URLs
    self.get_values = None  # get data values
    self._res = DataSet()  # the information set
    self._missing = []  # missing variables

    # Options for request
    self._opt_request_req = False
    self._opt_delete_base_kvars = False
    self._opt_discipline_inline_function_single = True

    # Base parameters for data providers
    for key, expression in (
        ('LASTYEAR', '(($THISYEAR-$YUPD))'),
        ('PREVYEAR', '(($LASTYEAR-1))'),
    ):
        self._append_param(key, expression)

    # Password violation (do not make other requests)
    self._password_violation = False
def execute(self):
    """Download ``self._url`` and extract the table named ``self._table``.

    The page is fetched with ``urllib2`` (through the proxy returned by
    ``get_proxy()`` when there is one) and parsed with ``BeautifulSoup``.
    The table whose first cell starts with ``"Table <self._table>"`` is split
    into sections by title rows; every 7-cell row becomes one record.

    :returns: a ``DataSet`` mapping ``"<COLUMN>_<section>"`` to a list of cell
        strings, where COLUMN is one of MEMBER, EFDATE, EXDATE, AGREED,
        UNDRAWN, OUTSTANDING, POQ; ``None`` when the table is not in the page.
    """
    logger.debug('begin')

    # Route urllib2 through the proxy, if any.
    proxy_info = get_proxy()
    if proxy_info:
        proxy_map = {
            'http': proxy_info['proxy'],
            'https': proxy_info['proxy'],
        }
        opener = urllib2.build_opener(urllib2.ProxyHandler(proxy_map))
        urllib2.install_opener(opener)

    page = urllib2.urlopen(self._url)
    try:
        logger.debug('Got %s', self._url)
        soup = BeautifulSoup(page)
    finally:
        # The original never closed the HTTP response.
        page.close()

    # This block looks for the requested table.
    table = None
    for candidate in soup.findAll('table'):
        first_row = candidate.tr
        first_cell = first_row.td if first_row is not None else None
        if first_cell is None:
            # Tables without a first cell cannot carry the title.
            continue
        if re.match('^Table %s' % self._table, str(first_cell.text), re.I):
            table = candidate
            break
    if table is None:
        # The original had an unreachable ``raise ValueError`` after this
        # return; returning None is the behaviour callers actually got.
        logger.error('Table %s not in page', self._table)
        return None

    # Collect the non-blank cells of every row.
    # NOTE(review): indentation was lost in the original; blank cells are
    # assumed to be skipped (that is what yields the 4-cell totals rows).
    rows = []
    for tr in table.findAll('tr'):
        cells = []
        for col, td in enumerate(tr.findAll('td')):
            txt = td.text
            if txt == ' ':
                continue
            if re.match('^--$', txt):
                txt = ''
            elif re.match(',', txt):
                # NOTE(review): re.match only fires when the text *starts*
                # with a comma; thousands separators inside numbers are kept.
                # Left unchanged on purpose - confirm the intent.
                txt = txt.replace(',', '')
            elif col == 0:
                # Strip a trailing footnote marker such as "1" or "1/".
                marker = re.match(r'^.+([0-9]\/?)$', txt)
                if marker:
                    txt = txt[:-len(marker.group(1))]
            cells.append(txt)
        rows.append(cells)

    # Group the 7-cell detail rows by section; a section starts at each
    # title row, matched in the order listed here.
    titles = [
        'Stand-by Arrangements',
        'Extended Arrangements',
        'Flexible Credit Line',
        'Precautionary',
    ]
    title = titles.pop(0)
    section = -1
    details = {}
    for row in rows:
        if len(row) == 1 and re.match("^%s" % title, row[0], re.I):
            section += 1
            if titles:
                title = titles.pop(0)
                logger.debug('Next Title %s', title)
        elif len(row) == 7:
            details.setdefault(section, []).append(row)
        elif len(row) == 4 and re.search('Arrangements?$', row[0]):
            # Totals row ("<n> Arrangements"): only used as a sanity check.
            total = re.match('^([0-9]+) Arrangements?$', row[0])
            # .get(): a totals row with no detail rows used to raise KeyError.
            if total and len(details.get(section, [])) == int(total.group(1)):
                logger.debug("Wow right arrangement found")
        elif row:
            logger.debug('LINE NOT INSERTED %d > %s', len(row), row)

    # Make the result vectors, one per column and section.
    columns = (
        'MEMBER',
        'EFDATE',
        'EXDATE',
        'AGREED',
        'UNDRAWN',
        'OUTSTANDING',
        'POQ',
    )
    _ds = DataSet()
    for section, members in sorted(details.items()):
        for col, label in enumerate(columns):
            name = "%s_%d" % (label, section)
            if not _ds.has_key(name):
                _ds[name] = []
            for member in members:
                if label == 'POQ' and len(member[col]) > 0:
                    _ds[name].append(member[col] + "\\%")
                else:
                    _ds[name].append(member[col].lstrip('*'))
    return _ds
class DataProvider(object):
    """
    General access class to Data Providers with URI notation like:

    * ``dstream://Datastream/AGINDEMIF`` or
    * ``flinp://DB22/ECB_STS1/M.BG.N.PROD.NS0020.4.000?name=IPBUL&start=$NOW-24M&end=$NOW&proc=weighted_avg&(IPBUL2000AVG)check``

    Requests are queued with :meth:`append` and executed by :meth:`request`.
    ``self.provider``, ``self.get``, ``self._append_param``,
    ``self._append_option`` and ``self._inline_processor`` are used here but
    are not defined in this part of the class.
    """

    # library functions
    _func = {
        'ISO2': {
            'def': ndc.get_country_alpha2,
            'args': ('code', )
        },
        'ISO3': {
            'def': ndc.get_country_alpha3,
            'args': ('code', )
        },
        'ISON': {
            'def': ndc.get_country_numeric,
            'args': ('code', )
        },
        'ISOAREA': {
            'def': ndc.get_area_name,
            'args': ('code', )
        },
        'ISOREGION': {
            'def': ndc.get_region_name,
            'args': ('code', )
        }
    }

    def __init__(self, name, profile=None, options=Options()):
        """Initialise the provider state.

        :param name: provider name, also used as accounting key prefix.
        :param profile: connection profile.
        :param options: options object.

        NOTE: the ``options`` default is created once, at class definition
        time, so instances built without ``options`` share one object.
        """
        # Value unused; the attribute read is kept from the original.
        counter = options.counter
        self._options = options
        self.name = name
        self._retries = 3  # TODO: to get from options
        # parsed request URLs (lists built from urlparse tuples)
        self.requesting = []
        # get data values
        self.get_values = None
        # The Information Set
        self._res = DataSet()
        # Missing Variables
        self._missing = []
        # Connection Profile
        self._profile = profile
        # Options for request
        self._opt_request_req = False
        self._opt_delete_base_kvars = False
        self._opt_discipline_inline_function_single = True
        # Base parameters for data providers
        self._append_param('LASTYEAR', '(($THISYEAR-$YUPD))')
        self._append_param('PREVYEAR', '(($LASTYEAR-1))')
        # Password violation (do not make other requests)
        self._password_violation = False

    def append(self, url):
        """
        Append a data request URL to the list

        >>> dp = DataProvider('demo')
        >>> dp.append('dstream://Datastream/AGINDEMIF')
        >>> dp.append('dstream://Datastream/PCH#(AGINDEMIF,1Y)')
        >>> dp.append('option://param/TEST?VALUE')

        The URL is upper-cased and stripped of comments. ``option://`` URLs
        are handed to ``self._append_option``; any other URL is parsed and
        appended to ``self.requesting``. ``$``-parameters are substituted
        from ``self._options.define_set`` for non-option URLs.
        """
        url = stripcomments(url.rstrip('\n ').upper())
        if len(url) == 0:
            return

        # Double parsing is needed because urlparse cannot parse the query
        # part (?...&...) when the scheme is not http.
        # '#' is masked so urlparse does not treat it as a fragment marker.
        hash_mask = '__A~~~~A__'

        # First pass: only to get the scheme.
        up = urlparse(url.replace('#', hash_mask))
        is_option = re.search('^option$', up.scheme, re.IGNORECASE)

        # Substitute parameters when the scheme is not "option".
        if not is_option and '$' in url:
            url = Template(url).safe_substitute(self._options.define_set)

        # Re-apply the mask on the (possibly substituted) URL.
        url = url.replace('#', hash_mask)

        # Second pass using a fake http: scheme. The scheme is escaped so
        # that characters such as '+' or '.' are matched literally.
        h_url = re.sub("^%s://" % re.escape(up.scheme), "http://", url,
                       flags=re.I)
        parsed = list(urlparse(h_url))
        parsed[0] = unicode(up.scheme)
        parsed[2] = parsed[2].replace(hash_mask, '#')
        parsed[2] = parsed[2].replace('\\', '/')

        # Option URIs go into the option list...
        if is_option:
            self._append_option(parsed)
            return

        # ...everything else into the requesting list.
        self.requesting.append(parsed)

    def info(self):
        """Return one ``HOST|[series]|params`` chunk per queued request,
        concatenated without separator."""
        # The original reused ``s`` both as accumulator and as the list
        # comprehension variable; in Python 2 that variable leaks and
        # overwrote the accumulator on every iteration.
        chunks = []
        for req in self.requesting:
            hostname = req[1].upper()
            series = [_replace_funcs(item, self._func)
                      for item in [req[2][1:]]]
            xparams = req[4]
            chunks.append("%s|%s|%s" % (hostname, series, xparams))
        return "".join(chunks)

    def mk_request(self, sources, serie):
        """The mk_request transform the request string in the structure
        understood by the provider driver.

        This base implementation returns ``sources`` unchanged.
        """
        return sources

    def request(self, profile=None):
        """
        Request data for every queued URL.

        :param profile: only recorded in the accounting data.
        :returns: the information set (``self._res``), with the names of the
            missing series registered through ``add_missing``.
        """
        if hasattr(self.provider, 'open'):
            self.provider.open()

        self._res = DataSet()  # Base Dataset is empty

        # accounting stuff
        _accounting['datareq.profile'] = profile

        try:
            for req in self.requesting:
                kvars = udict({
                    'name': None,
                    'start': None,
                    'end': None,
                    'proc': None,
                    'check': None
                })
                hostname = req[1].upper()
                series = [_replace_funcs(item, self._func)
                          for item in [req[2][1:]]]
                xparams = req[4]

                _accounting["datareq.series.%s" % ','.join(series)] = \
                    '%s | %s (%s%s)' % (
                        hostname, str(xparams),
                        'R' if self._opt_request_req else "S",
                        'K' if self._opt_delete_base_kvars else "-")

                # Replace self._funcs
                xparams = _replace_funcs(xparams, self._func)

                if len(xparams) > 0:
                    kvars2 = udict(parse_qsl(xparams))
                    accepted(kvars, 'NAME', 'START', 'END', 'PROC', 'CHECK')
                    kvars.update(kvars2)
                    # NAME defaults to None; a query string without a name
                    # used to crash on None.strip().
                    if 'NAME' in kvars and kvars['NAME'] is not None:
                        kvars['NAME'] = kvars['NAME'].strip()
                    if self._opt_delete_base_kvars:
                        # NOTE(review): kvars2 is not read after this point,
                        # so this has no visible effect here - confirm.
                        for _k in list(kvars.keys()):
                            if _k in kvars2:
                                del kvars2[_k]

                if self._options.only_options:
                    continue

                if self._opt_request_req:
                    reqs = self.mk_request(req, hostname)
                else:
                    reqs = self.mk_request(series, hostname)

                with Timer() as t:
                    res = self.mget(reqs, **kvars)

                prefix = '%s.request.%s' % (self.name, kvars['NAME'])
                _accounting[prefix + '.time'] = t.msecs
                _accounting[prefix + '.req'] = req
                if res:
                    _accounting[prefix + '.res'] = res
        finally:
            # Close the provider even when a request fails.
            if hasattr(self.provider, 'close'):
                self.provider.close()

        _accounting['%s.request.missing' % (self.name)] = \
            ','.join(self._missing)
        self._res.add_missing(*self._missing)
        return self._res

    def mget(self, reqs, **kw):
        """Fetch every series in ``reqs`` and merge it into ``self._res``.

        :param reqs: iterable of series identifiers passed to ``self.get``.
        :param kw: request parameters; ``NAME`` names the result and falls
            back to the series identifier when empty.
        :returns: the value ``self.get`` returned for the last series, or
            ``None`` when it failed or ``reqs`` was empty.

        Series for which ``self.get`` raises ``ValueError`` or returns a
        false value are recorded in ``self._missing``.
        """
        # Defined up front so an empty ``reqs`` no longer raises
        # UnboundLocalError at the final return.
        _ts = None
        for serie in reqs:
            name = serie
            if kw.get('NAME'):
                name = kw['NAME']
            else:
                kw['NAME'] = serie

            _ts = None
            try:
                # gets data from provider
                _ts = self.get(serie, **kw)
            except ValueError:
                logger.debug(
                    'Not saving %s in information set - series missing',
                    serie)

            if _ts:
                # ...save in results
                self._res.update(_ts)
            elif 'NAME' in kw and kw['NAME'] is not None:
                # or in missing list
                self._missing.extend(kw['NAME'].split(','))
            else:
                logger.warning('Anonymous MISSNG found')

            _accounting['load.inline.processors.discipline'] = \
                self._opt_discipline_inline_function_single

            # NOTE(review): indentation was lost in the original; the
            # "single" discipline is assumed to run once per series and the
            # other one once after the loop - confirm.
            if self._opt_discipline_inline_function_single == True:
                self._res = self._inline_processor(self._res, name, kw)

        if self._opt_discipline_inline_function_single != True:
            self._res = self._inline_processor(self._res, None, kw)
        return _ts
def request(self, profile=None):
    """
    Request data for every queued URL in ``self.requesting``.

    :param profile: only recorded in the accounting data.
    :returns: the information set (``self._res``), with the names of the
        missing series registered through ``add_missing``.

    The provider is opened before and closed after the requests when it
    exposes ``open``/``close``. Nothing is fetched while
    ``self._options.only_options`` is set.
    """
    if hasattr(self.provider, 'open'):
        self.provider.open()

    self._res = DataSet()  # Base Dataset is empty

    # accounting stuff
    _accounting['datareq.profile'] = profile

    try:
        for req in self.requesting:
            kvars = udict({
                'name': None,
                'start': None,
                'end': None,
                'proc': None,
                'check': None
            })
            hostname = req[1].upper()
            series = [_replace_funcs(item, self._func)
                      for item in [req[2][1:]]]
            xparams = req[4]

            _accounting["datareq.series.%s" % ','.join(series)] = \
                '%s | %s (%s%s)' % (
                    hostname, str(xparams),
                    'R' if self._opt_request_req else "S",
                    'K' if self._opt_delete_base_kvars else "-")

            # Replace self._funcs
            xparams = _replace_funcs(xparams, self._func)

            if len(xparams) > 0:
                kvars2 = udict(parse_qsl(xparams))
                accepted(kvars, 'NAME', 'START', 'END', 'PROC', 'CHECK')
                kvars.update(kvars2)
                # NAME defaults to None; a query string without a name used
                # to crash on None.strip().
                if 'NAME' in kvars and kvars['NAME'] is not None:
                    kvars['NAME'] = kvars['NAME'].strip()
                if self._opt_delete_base_kvars:
                    # NOTE(review): kvars2 is not read after this point, so
                    # this has no visible effect here - confirm.
                    for _k in list(kvars.keys()):
                        if _k in kvars2:
                            del kvars2[_k]

            if self._options.only_options:
                continue

            if self._opt_request_req:
                reqs = self.mk_request(req, hostname)
            else:
                reqs = self.mk_request(series, hostname)

            with Timer() as t:
                res = self.mget(reqs, **kvars)

            prefix = '%s.request.%s' % (self.name, kvars['NAME'])
            _accounting[prefix + '.time'] = t.msecs
            _accounting[prefix + '.req'] = req
            if res:
                _accounting[prefix + '.res'] = res
    finally:
        # Close the provider even when a request fails.
        if hasattr(self.provider, 'close'):
            self.provider.close()

    _accounting['%s.request.missing' % (self.name)] = ','.join(self._missing)
    self._res.add_missing(*self._missing)
    return self._res