def get_indicators():
    '''Download information about all World Bank data series
    '''
    url = 'http://api.worldbank.org/indicators?per_page=50000&format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    # Clean fields
    data.source = [x['value'] for x in data.source]
    fun = lambda x: x.encode('ascii', 'ignore')
    data.sourceOrganization = data.sourceOrganization.apply(fun)

    # Clean topic field
    def get_value(x):
        try:
            return x['value']
        except:
            return ''
    fun = lambda x: [get_value(y) for y in x]
    data.topics = data.topics.apply(fun)
    data.topics = data.topics.apply(lambda x: ' ; '.join(x))
    # Clean output
    data = data.sort(columns='id')
    data.index = pandas.Index(lrange(data.shape[0]))
    return data
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(io):
        try:
            with urlopen(io) as url:
                raw_text = url.read()
        except urllib2.URLError:
            raise ValueError('Invalid URL: "{0}"'.format(io))
    elif hasattr(io, "read"):
        raw_text = io.read()
    elif os.path.isfile(io):
        with open(io) as f:
            raw_text = f.read()
    elif isinstance(io, basestring):
        raw_text = io
    else:
        raise TypeError("Cannot read object of type "
                        "'{0.__class__.__name__!r}'".format(io))
    return raw_text
def get_elements_from_url(url, element='table', base_url="file://"):
    _skip_if_none_of(('bs4', 'html5lib'))
    url = "".join([base_url, url])
    from bs4 import BeautifulSoup
    with urlopen(url) as f:
        soup = BeautifulSoup(f, features='html5lib')
    return soup.find_all(element)
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(io):
        with urlopen(io) as url:
            raw_text = url.read()
    elif hasattr(io, 'read'):
        raw_text = io.read()
    elif os.path.isfile(io):
        with open(io) as f:
            raw_text = f.read()
    elif isinstance(io, compat.string_types):
        raw_text = io
    else:
        raise TypeError("Cannot read object of type "
                        "'{0.__class__.__name__!r}'".format(io))
    return raw_text
def dump_as_gist(data, desc="The Commit", njobs=None):
    host, njobs2 = get_travis_data()[:2]

    if njobs:  # be slightly more reliable
        njobs = max(njobs, njobs2)

    content = dict(version="0.1.1",
                   timings=data,
                   datetime=get_utcdatetime(),  # added in 0.1.1
                   hostname=host,               # added in 0.1.1
                   njobs=njobs  # added in 0.1.1, a measure of load on the travis box
                   )

    payload = dict(description=desc,
                   public=True,
                   files={'results.json': dict(content=json.dumps(content))})
    try:
        with closing(urlopen("https://api.github.com/gists",
                             json.dumps(payload), timeout=WEB_TIMEOUT)) as r:
            if 200 <= r.getcode() < 300:
                print("\n\n" + "-" * 80)
                gist = json.loads(r.read())
                file_raw_url = list(gist['files'].items())[0][1]['raw_url']
                print("[vbench-gist-raw_url] %s" % file_raw_url)
                print("[vbench-html-url] %s" % gist['html_url'])
                print("[vbench-api-url] %s" % gist['url'])
                print("-" * 80 + "\n\n")
            else:
                print("api.github.com returned status %d" % r.getcode())
    except:
        print("Error occurred while dumping to gist")
def _read(obj):
    """Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(obj):
        with urlopen(obj) as url:
            text = url.read()
    elif hasattr(obj, 'read'):
        text = obj.read()
    elif isinstance(obj, char_types):
        text = obj
        try:
            if os.path.isfile(text):
                with open(text, 'rb') as f:
                    return f.read()
        except (TypeError, ValueError):
            pass
    else:
        raise TypeError("Cannot read object of type %r" % type(obj).__name__)
    return text
def _retry_read_url(url, retry_count, pause, name):
    """ Open url (and retry) """
    for _ in range(retry_count):
        time.sleep(pause)

        # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                          parse_dates=True, na_values='-')[::-1]
            # Yahoo! Finance sometimes does this awesome thing where they
            # return 2 rows for the most recent business day
            if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
                rs = rs[:-1]

            # Get rid of unicode characters in index name.
            try:
                rs.index.name = rs.index.name.decode(
                    'unicode_escape').encode('ascii', 'ignore')
            except AttributeError:
                # Python 3 string has no decode method.
                rs.index.name = rs.index.name.encode('ascii', 'ignore').decode()

            return rs

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" % (retry_count, name, url))
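# A self-contained sketch of the retry pattern used by _retry_read_url above:
# sleep, attempt, swallow transient errors, raise only after the last try.
# (Assumption: standard-library urllib is a fair stand-in for pandas'
# urlopen context manager; the URL and error classes here are illustrative.)
import time
from urllib.request import urlopen as _urlopen
from urllib.error import URLError


def _retry_fetch_demo(url, retry_count=3, pause=0.1):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            with _urlopen(url, timeout=5) as resp:
                return resp.read()
        except (URLError, OSError):
            pass  # transient failure: fall through and retry
    raise IOError("after %d tries, no response from %r" % (retry_count, url))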
def get_indicators():
    """Download information about all World Bank data series
    """
    url = "http://api.worldbank.org/indicators?per_page=50000&format=json"
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    # Clean fields
    data.source = [x["value"] for x in data.source]
    fun = lambda x: x.encode("ascii", "ignore")
    data.sourceOrganization = data.sourceOrganization.apply(fun)

    # Clean topic field
    def get_value(x):
        try:
            return x["value"]
        except:
            return ""
    fun = lambda x: [get_value(y) for y in x]
    data.topics = data.topics.apply(fun)
    data.topics = data.topics.apply(lambda x: " ; ".join(x))
    # Clean output
    data = data.sort(columns="id")
    data.index = pandas.Index(lrange(data.shape[0]))
    return data
def get_elements_from_file(url, element='table'):
    _skip_if_none_of(('bs4', 'html5lib'))
    url = file_path_to_url(url)
    from bs4 import BeautifulSoup
    with urlopen(url) as f:
        soup = BeautifulSoup(f, features='html5lib')
    return soup.find_all(element)
def get_data_famafrench(name):
    # path of zip files
    zip_file_path = '{0}/{1}.zip'.format(_FAMAFRENCH_URL, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).readlines()

    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

    datasets = {}
    edges = zip(file_edges + 1, file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(lmap(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(j, hj)
                      for j, hj in enumerate(header, start=1)]
            index = np.array([d[0] for d in ds_header], dtype=int)
            dataset = np.array([d[1:] for d in ds_header], dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
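# A self-contained sketch of the zip-in-memory step above: download the
# bytes, spool them into a temporary file, and read the first member of the
# archive. (Assumption: plain tempfile/zipfile stand in for the module
# context used by get_data_famafrench.)
import tempfile
from zipfile import ZipFile


def _first_zip_member_lines(raw_bytes):
    # raw_bytes is the body of an already-downloaded .zip file
    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw_bytes)
        with ZipFile(tmpf, 'r') as zf:
            return zf.open(zf.namelist()[0]).readlines()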
def get_dividends_yahoo(sid, start, end):
    # Taken from get_data_yahoo in Pandas library and adjust a single
    # parameter to get dividends
    from pandas.compat import StringIO, bytes_to_str
    from pandas.io.common import urlopen

    start, end = pd.to_datetime(start), pd.to_datetime(end)
    url = ('http://ichart.finance.yahoo.com/table.csv?' + 's=%s' % sid +
           '&a=%s' % (start.month - 1) +
           '&b=%s' % start.day +
           '&c=%s' % start.year +
           '&d=%s' % (end.month - 1) +
           '&e=%s' % end.day +
           '&f=%s' % end.year +
           '&g=v' +  # THE CHANGE
           '&ignore=.csv')

    with urlopen(url) as resp:
        lines = resp.read()
    rs = pd.read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                     parse_dates=True, na_values='-')[::-1]
    # Yahoo! Finance sometimes does this awesome thing where they
    # return 2 rows for the most recent business day
    if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
        rs = rs[:-1]
    return rs
def get_vbench_log(build_url):
    with urlopen(build_url) as r:
        if not (200 <= r.getcode() < 300):
            return

        s = json.loads(r.read())
        s = [x for x in s['matrix']
             if "VBENCH" in ((x.get('config', {}) or {}).get('env', {}) or {})]
        # s = [x for x in s['matrix']]
        if not s:
            return
        id = s[0]['id']  # should be just one for now

        with urlopen("https://api.travis-ci.org/jobs/%s" % id) as r2:
            if not (200 <= r2.getcode() < 300):
                return
            s2 = json.loads(r2.read())
            return s2.get('log')
def get_elements_from_file(url, element="table"):
    _skip_if_none_of(("bs4", "html5lib"))
    url = file_path_to_url(url)
    from bs4 import BeautifulSoup
    with urlopen(url) as f:
        soup = BeautifulSoup(f, features="html5lib")
    return soup.find_all(element)
def _get_page(page_number):
    gh_url = ('https://api.github.com/repos/pydata/pandas/issues?'
              'milestone=*&state=closed&assignee=*&page=%d') % page_number
    with urlopen(gh_url) as resp:
        rs = resp.readlines()[0]
    jsondata = json.loads(rs)
    issues = [Issue(x['title'], x['labels'], x['number'],
                    get_milestone(x['milestone']), x['body'], x['state'])
              for x in jsondata]
    return issues
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country="US",
              start=2002, end=2005):
    if isinstance(country, str):
        country = [country]
    countries = ";".join(country)

    # Build URL for api call
    url = ("http://api.worldbank.org/countries/" + countries +
           "/indicators/" + indicator + "?date=" + str(start) + ":" +
           str(end) + "&per_page=25000&format=json")

    # Download
    with urlopen(url) as response:
        data = response.read()

    # Check to see if there is a possible problem
    possible_message = json.loads(data)[0]
    if "message" in possible_message.keys():
        msg = possible_message["message"][0]
        try:
            msg = msg["key"].split() + ["\n "] + msg["value"].split()
            wb_err = " ".join(msg)
        except:
            wb_err = ""
            if "key" in msg.keys():
                wb_err = msg["key"] + "\n "
            if "value" in msg.keys():
                wb_err += msg["value"]
        error_msg = "Problem with a World Bank Query \n %s"
        return None, error_msg % wb_err

    if "total" in possible_message.keys():
        if possible_message["total"] == 0:
            return None, "No results from world bank."

    # Parse JSON file
    data = json.loads(data)[1]
    country = [x["country"]["value"] for x in data]
    iso_code = [x["country"]["id"] for x in data]
    year = [x["date"] for x in data]
    value = [x["value"] for x in data]

    # Prepare output
    out = pandas.DataFrame([country, iso_code, year, value]).T
    out.columns = ["country", "iso_code", "year", indicator]
    return out, "Success"
def fetch_data(url, name):
    # Note: `start` and `end` are free variables here; this helper is
    # presumably defined inside a function that provides them.
    with urlopen(url) as resp:
        data = read_csv(resp, index_col=0, parse_dates=True, header=None,
                        skiprows=1, names=["DATE", name], na_values=".")
    try:
        return data.truncate(start, end)
    except KeyError:
        if data.ix[3].name[7:12] == "Error":
            raise IOError("Failed to get the data. Check that {0!r} is "
                          "a valid FRED series.".format(name))
        raise
def _download_data_famafrench(name):
    url = "".join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
    with urlopen(url) as socket:
        raw = socket.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, "r") as zf:
            data = zf.open(zf.namelist()[0]).read().decode()

    return data
def get_components_yahoo(idx_sym):
    """
    Returns DataFrame containing list of component information for
    index represented in idx_sym from yahoo. Includes component symbol
    (ticker), exchange, and name.

    Parameters
    ----------
    idx_sym : str
        Stock index symbol
        Examples:
        '^DJI' (Dow Jones Industrial Average)
        '^NYA' (NYSE Composite)
        '^IXIC' (NASDAQ Composite)
        See: http://finance.yahoo.com/indices for other index symbols

    Returns
    -------
    idx_df : DataFrame
    """
    stats = 'snx'
    # URL of form:
    # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
    url = ('http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}'
           '&e=.csv&h={2}')

    idx_mod = idx_sym.replace('^', '@%5E')
    url_str = url.format(idx_mod, stats, 1)

    idx_df = DataFrame()
    mask = [True]
    comp_idx = 1

    # LOOP across component index structure,
    # break when no new components are found
    while True in mask:
        url_str = url.format(idx_mod, stats, comp_idx)
        with urlopen(url_str) as resp:
            raw = resp.read()
        lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"')
        lines = [line.strip().split('","') for line in lines]

        temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
        temp_df = temp_df.drop_duplicates()
        temp_df = temp_df.set_index('ticker')
        mask = ~temp_df.index.isin(idx_df.index)

        comp_idx = comp_idx + 50
        idx_df = idx_df.append(temp_df[mask])

    return idx_df
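# Hypothetical usage sketch for get_components_yahoo above (hedged: the
# quotes.csv endpoint has long been retired, so this only documents the
# historical call shape):
#
#   dow = get_components_yahoo('^DJI')
#   dow.head()  # ticker-indexed frame with 'name' and 'exchange' columns
#
# The while-loop pages through the index 50 components at a time and stops
# once a page contributes no tickers that aren't already in idx_df.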
def get_countries():
    """Query information about countries
    """
    url = "http://api.worldbank.org/countries/?per_page=1000&format=json"
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x["value"] for x in data.adminregion]
    data.incomeLevel = [x["value"] for x in data.incomeLevel]
    data.lendingType = [x["value"] for x in data.lendingType]
    data.region = [x["value"] for x in data.region]
    data = data.rename(columns={"id": "iso3c", "iso2Code": "iso2c"})
    return data
def get_countries():
    '''Query information about countries
    '''
    url = 'http://api.worldbank.org/countries/all?format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x['value'] for x in data.adminregion]
    data.incomeLevel = [x['value'] for x in data.incomeLevel]
    data.lendingType = [x['value'] for x in data.lendingType]
    data.region = [x['value'] for x in data.region]
    data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})
    return data
def xsg_data(year=None, month=None, retry_count=3, pause=0.001):
    """
    Fetch data on the unlocking of restricted shares.

    Parameters
    --------
    year : the year, defaults to the current year
    month : the unlock month, defaults to the current month
    retry_count : int, default 3
        Number of times to retry on network problems.
    pause : int, default 0
        Seconds to pause between repeated requests, to avoid problems
        caused by requests arriving too close together.

    Return
    ------
    DataFrame
        code: stock code
        name: name
        date: unlock date
        count: number of shares unlocked (in units of 10,000 shares)
        ratio: share of the total float
    """
    year = dt.get_year() if year is None else year
    month = dt.get_month() if month is None else month
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            with urlopen(rv.XSG_URL % (ct.P_TYPE['http'], ct.DOMAINS['em'],
                                       ct.PAGES['emxsg'], year, month)) as resp:
                lines = resp.read()
                lines = lines.decode('utf-8') if ct.PY3 else lines
        except _network_error_classes:
            pass
        else:
            da = lines[3:len(lines) - 3]
            rows = []
            for row in da.split('","'):
                rows.append([data for data in row.split(',')])
            df = pd.DataFrame(rows)
            df = df[[1, 3, 4, 5, 6]]
            for col in [5, 6]:
                df[col] = df[col].astype(float)
            df[5] = df[5] / 10000
            df[6] = df[6] * 100
            df[5] = df[5].map(ct.FORMAT)
            df[6] = df[6].map(ct.FORMAT)
            df.columns = rv.XSG_COLS
            return df
    raise IOError("Fetch failed; check the network and the URL")
def convert_json_to_df(results_url):
    """retrieve json results file from url and return df

    df contains timings for all successful vbenchmarks
    """
    with urlopen(results_url) as resp:
        res = json.loads(resp.read())
    timings = res.get("timings")
    if not timings:
        return
    res = [x for x in timings if x.get('succeeded')]
    df = pd.DataFrame(res)
    df = df.set_index("name")
    return df
def guba_sina(show_content=False):
    from pandas.io.common import urlopen
    try:
        html = lxml.html.parse(nv.GUBA_SINA_URL % (ct.P_TYPE['http'],
                                                   ct.DOMAINS['sina']))
        # res = html.xpath('//div[@class=\"topNav\"]/div')
        # print(res)
        # return ''
        with urlopen(nv.GUBA_SINA_URL % (ct.P_TYPE['http'],
                                         ct.DOMAINS['sina'])) as resp:
            lines = resp.read()
            print(lines)
    except Exception as er:
        print(str(er))
        pass
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US',
              start=2002, end=2005):
    if isinstance(country, str):
        country = [country]
    countries = ';'.join(country)

    # Build URL for api call
    url = ("http://api.worldbank.org/countries/" + countries +
           "/indicators/" + indicator + "?date=" + str(start) + ":" +
           str(end) + "&per_page=25000&format=json")

    # Download
    with urlopen(url) as response:
        data = response.read()

    # Check to see if there is a possible problem
    possible_message = json.loads(data)[0]
    if 'message' in possible_message.keys():
        msg = possible_message['message'][0]
        try:
            msg = msg['key'].split() + ["\n "] + msg['value'].split()
            wb_err = ' '.join(msg)
        except:
            wb_err = ""
            if 'key' in msg.keys():
                wb_err = msg['key'] + "\n "
            if 'value' in msg.keys():
                wb_err += msg['value']
        error_msg = "Problem with a World Bank Query \n %s"
        return None, error_msg % wb_err

    if 'total' in possible_message.keys():
        if possible_message['total'] == 0:
            return None, "No results from world bank."

    # Parse JSON file
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso_code = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]

    # Prepare output
    out = pandas.DataFrame([country, iso_code, year, value]).T
    out.columns = ['country', 'iso_code', 'year', indicator]
    return out, "Success"
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError

    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            raise e
    else:
        if not hasattr(r, "text_content"):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
def _read(obj):
    if _is_url(obj):
        with urlopen(obj) as url:
            text = url.read()
    elif hasattr(obj, 'read'):
        text = obj.read()
    elif isinstance(obj, char_types):
        text = obj
        try:
            if os.path.isfile(text):
                with open(text, 'rb') as f:
                    return f.read()
        except (TypeError, ValueError):
            pass
    else:
        raise TypeError("Cannot read object of type %r" % type(obj).__name__)
    return text
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError

    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            raise e
    else:
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
def guba_sina(show_content=False):
    from pandas.io.common import urlopen
    try:
        html = lxml.html.parse(nv.GUBA_SINA_URL % (ct.P_TYPE['http'],
                                                   ct.DOMAINS['sina']))
        # res = html.xpath('//div[@class=\"topNav\"]/div')
        # print(res)
        # return ''
        with urlopen(nv.GUBA_SINA_URL % (ct.P_TYPE['http'],
                                         ct.DOMAINS['sina'])) as resp:
            lines = resp.read()
            print(lines)
    except Exception as er:
        print(str(er))
        pass
def _get_data(symbols):
    """
    Get current yahoo quote

    Returns a DataFrame
    """
    if isinstance(symbols, compat.string_types):
        sym_list = symbols
    else:
        sym_list = '+'.join(symbols)

    # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
    request = ''.join(compat.itervalues(_yahoo_codes))  # code request string
    header = list(_yahoo_codes.keys())

    data = defaultdict(list)

    params = {
        's': sym_list,
        'f': request
    }
    url = _encode_url(_URL, params)

    with urlopen(url) as response:
        lines = response.readlines()

    def line_gen(lines):
        for line in lines:
            yield line.decode('utf-8').strip()

    for line in csv.reader(line_gen(lines)):
        for i, field in enumerate(line):
            if field[-2:] == '%"':
                v = float(field.strip('"%'))
            elif field[0] == '"':
                v = field.strip('"')
            else:
                try:
                    v = float(field)
                except ValueError:
                    v = field
            data[header[i]].append(v)

    idx = data.pop('symbol')
    return DataFrame(data, index=idx)
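# A standalone illustration of the per-field coercion rule used above:
# a trailing '%"' marks a quoted percentage, a leading '"' marks a quoted
# string, and anything else is tried as a float and kept as text on failure.
# (Hedged: a simplified extract of the parsing loop, not the full function.)
def _coerce_field_demo(field):
    if field[-2:] == '%"':
        return float(field.strip('"%'))
    elif field[0] == '"':
        return field.strip('"')
    try:
        return float(field)
    except ValueError:
        return field


assert _coerce_field_demo('"1.25%"') == 1.25
assert _coerce_field_demo('"AAPL"') == 'AAPL'
assert _coerce_field_demo('42') == 42.0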
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US',
              start=2002, end=2005):
    # Build URL for api call
    url = ("http://api.worldbank.org/countries/" + country + "/indicators/" +
           indicator + "?date=" + str(start) + ":" + str(end) +
           "&per_page=25000" + "&format=json")
    # Download
    with urlopen(url) as response:
        data = response.read()
    # Parse JSON file
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso2c = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]
    # Prepare output
    out = pandas.DataFrame([country, iso2c, year, value]).T
    return out
def get_hist_data(code=None, start=None, end=None, retry_count=3,
                  pause=0.001):
    """
    Fetch the historical trading records of an individual stock.

    Parameters
    ------
    code : string
        Stock code, e.g. 600848
    start : string
        Start date, format YYYY-MM-DD; if empty, data starts from the
        earliest date the API provides.
    end : string
        End date, format YYYY-MM-DD; if empty, data runs to the most
        recent trading day.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Seconds to pause between repeated requests, to avoid problems
        caused by requests arriving too close together.

    return
    -------
    DataFrame
        Columns: date, open, high, close, low, volume, price change,
        percent change, 5/10/20-day mean price, 5/10/20-day mean volume,
        turnover rate.
    """
    if code is None or len(code) != 6:
        return None
    symbol = code_to_symbol(code)
    url = ct.DAY_PRICE_URL % (ct.P_TYPE['http'], ct.DOMAINS['ifeng'], symbol)
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            js = json.loads(lines)
            df = pd.DataFrame(js['record'], columns=ct.DAY_PRICE_COLUMNS)
            # strip thousands separators
            df = df.applymap(lambda x: x.replace(u',', u''))
            df = df.drop('price_change', axis=1)
            df = df.set_index(['date'])
            if start is not None:
                df = df.ix[df.index >= start]
            if end is not None:
                df = df.ix[df.index <= end]
            return df
    raise IOError("%s fetch failed; check the network and the URL: %s"
                  % (code, url))
def get_countries():
    '''Query information about countries

    Provides information such as:
        country code, region, income level,
        capital city, latitude and longitude
    '''
    url = 'http://api.worldbank.org/countries/?per_page=1000&format=json'
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x['value'] for x in data.adminregion]
    data.incomeLevel = [x['value'] for x in data.incomeLevel]
    data.lendingType = [x['value'] for x in data.lendingType]
    data.region = [x['value'] for x in data.region]
    data.latitude = [float(x) if x != "" else np.nan for x in data.latitude]
    data.longitude = [float(x) if x != "" else np.nan for x in data.longitude]
    data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})
    return data
def _holding_cotent(start, end, pageNo, retry_count, pause):
    url = rv.FUND_HOLDS_URL % (ct.P_TYPE['http'], ct.DOMAINS['163'],
                               ct.PAGES['163fh'], ct.PAGES['163fh'],
                               pageNo, start, end, _random(5))
    for _ in range(retry_count):
        time.sleep(pause)
        if pageNo > 0:
            print(rv.DP_MSG % pageNo)
        try:
            with urlopen(url) as resp:
                lines = resp.read()
            lines = lines.replace('--', '0')
            lines = json.loads(lines)
            data = lines['list']
            df = pd.DataFrame(data)
            df = df.drop(['CODE', 'ESYMBOL', 'EXCHANGE', 'NAME', 'RN',
                          'SHANGQIGUSHU', 'SHANGQISHIZHI', 'SHANGQISHULIANG'],
                         axis=1)
            for col in ['GUSHU', 'GUSHUBIJIAO', 'SHIZHI', 'SCSTC27']:
                df[col] = df[col].astype(float)
            df['SCSTC27'] = df['SCSTC27'] * 100
            df['GUSHU'] = df['GUSHU'] / 10000
            df['GUSHUBIJIAO'] = df['GUSHUBIJIAO'] / 10000
            df['SHIZHI'] = df['SHIZHI'] / 10000
            df['GUSHU'] = df['GUSHU'].map(ct.FORMAT)
            df['GUSHUBIJIAO'] = df['GUSHUBIJIAO'].map(ct.FORMAT)
            df['SHIZHI'] = df['SHIZHI'].map(ct.FORMAT)
            df['SCSTC27'] = df['SCSTC27'].map(ct.FORMAT)
            df.columns = rv.FUND_HOLDS_COLS
            df = df[['code', 'name', 'date', 'nums', 'nlast', 'count',
                     'clast', 'amount', 'ratio']]
        except _network_error_classes:
            pass
        else:
            if pageNo == 0:
                return df, int(lines['pagecount'])
            else:
                return df
    raise IOError("Fetch failed; check the network and the URL: %s" % url)
def _retry_read_url(url, retry_count, pause, name):
    for _ in range(retry_count):
        time.sleep(pause)

        # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                          parse_dates=True)[::-1]
            # Yahoo! Finance sometimes does this awesome thing where they
            # return 2 rows for the most recent business day
            if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
                rs = rs[:-1]
            return rs

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" % (retry_count, name, url))
def get_countries():
    """Query information about countries

    Provides information such as:
        country code, region, income level,
        capital city, latitude and longitude
    """
    url = "http://api.worldbank.org/countries/?per_page=1000&format=json"
    with urlopen(url) as response:
        data = response.read()
    data = json.loads(data)[1]
    data = pandas.DataFrame(data)
    data.adminregion = [x["value"] for x in data.adminregion]
    data.incomeLevel = [x["value"] for x in data.incomeLevel]
    data.lendingType = [x["value"] for x in data.lendingType]
    data.region = [x["value"] for x in data.region]
    data.latitude = [float(x) if x != "" else np.nan for x in data.latitude]
    data.longitude = [float(x) if x != "" else np.nan for x in data.longitude]
    data = data.rename(columns={"id": "iso3c", "iso2Code": "iso2c"})
    return data
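# A self-contained sketch of the field-flattening step the World Bank
# helpers above perform: nested {'id': ..., 'value': ...} dicts are reduced
# to their 'value' strings before the frame is renamed. (The sample payload
# below is illustrative, not real API output.)
import pandas as pd

_sample = [
    {"id": "USA", "iso2Code": "US",
     "region": {"id": "NAC", "value": "North America"},
     "incomeLevel": {"id": "HIC", "value": "High income"}},
]

_df = pd.DataFrame(_sample)
_df.region = [x["value"] for x in _df.region]
_df.incomeLevel = [x["value"] for x in _df.incomeLevel]
_df = _df.rename(columns={"id": "iso3c", "iso2Code": "iso2c"})
# _df now carries plain-string region/incomeLevel columns keyed by iso3c/iso2c.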
def __init__(self, filepath_or_buffer):
    # If filepath_or_buffer is a url, load the data into a BytesIO
    if _is_url(filepath_or_buffer):
        filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
    elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)

    if isinstance(filepath_or_buffer, self._workbook_class):
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        filepath_or_buffer.seek(0)
        self.book = self.load_workbook(filepath_or_buffer)
    elif isinstance(filepath_or_buffer, str):
        self.book = self.load_workbook(filepath_or_buffer)
    else:
        raise ValueError(
            "Must explicitly set engine if not passing in buffer or path for io."
        )
def guba_sina(show_content=False):
    """
    Fetch the headline stories from the front page of the Sina Finance
    stock forum (guba).

    Parameter
    --------
    show_content : whether to include the story content, default False

    Return
    --------
    DataFrame
        title: story title
        content: story content (when show_content=True)
        ptime: publish time
        rcounts: read count
    """
    from pandas.io.common import urlopen
    try:
        with urlopen(nv.GUBA_SINA_URL % (ct.P_TYPE['http'],
                                         ct.DOMAINS['sina'])) as resp:
            lines = resp.read()
        html = lxml.html.document_fromstring(lines)
        res = html.xpath('//ul[@class=\"list_05\"]/li')
        heads = html.xpath('//div[@class=\"tit_04\"]')
        data = []
        for head in heads[:1]:
            title = head.xpath('a/text()')[0]
            url = head.xpath('a/@href')[0]
            ds = [title]
            ds.extend(_guba_content(url))
            data.append(ds)
        for row in res:
            title = row.xpath('a[2]/text()')[0]
            url = row.xpath('a[2]/@href')[0]
            ds = [title]
            ds.extend(_guba_content(url))
            data.append(ds)
        df = pd.DataFrame(data, columns=nv.GUBA_SINA_COLS)
        df['rcounts'] = df['rcounts'].astype(float)
        return df if show_content is True else df.drop('content', axis=1)
    except Exception as er:
        print(str(er))
def guba_sina(show_content=False):
    """
    Fetch the headline stories from the front page of the Sina Finance
    stock forum (guba).

    Parameter
    --------
    show_content : whether to include the story content, default False

    Return
    --------
    DataFrame
        title: story title
        content: story content (when show_content=True)
        ptime: publish time
        rcounts: read count
    """
    from pandas.io.common import urlopen
    try:
        with urlopen(nv.GUBA_SINA_URL % (ct.P_TYPE['http'],
                                         ct.DOMAINS['sina'])) as resp:
            lines = resp.read()
        html = lxml.html.document_fromstring(lines)
        res = html.xpath('//ul[@class=\"list_05\"]/li')
        heads = html.xpath('//div[@class=\"tit_04\"]')
        data = []
        for head in heads[:1]:
            title = head.xpath('a/text()')[0]
            url = head.xpath('a/@href')[0]
            ds = [title]
            ds.extend(_guba_content(url))
            data.append(ds)
        for row in res:
            title = row.xpath('a[2]/text()')[0]
            url = row.xpath('a[2]/@href')[0]
            ds = [title]
            ds.extend(_guba_content(url))
            data.append(ds)
        df = pd.DataFrame(data, columns=nv.GUBA_SINA_COLS)
        df['rcounts'] = df['rcounts'].astype(float)
        return df if show_content is True else df.drop('content', axis=1)
    except Exception as er:
        print(str(er))
def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US',
              start=2002, end=2005):
    # Build URL for api call
    url = ("http://api.worldbank.org/countries/" + country + "/indicators/" +
           indicator + "?date=" + str(start) + ":" + str(end) +
           "&per_page=25000&format=json")
    # Download
    with urlopen(url) as response:
        data = response.read()
    # Parse JSON file
    data = json.loads(data)[1]
    country = [x['country']['value'] for x in data]
    iso2c = [x['country']['id'] for x in data]
    year = [x['date'] for x in data]
    value = [x['value'] for x in data]
    # Prepare output
    out = pandas.DataFrame([country, iso2c, year, value]).T
    return out
def can_connect(url, error_classes=_network_error_classes):
    """Try to connect to the given url. True if succeeds, False if IOError
    raised

    Parameters
    ----------
    url : basestring
        The URL to try to connect to

    Returns
    -------
    connectable : bool
        Return True if no IOError (unable to connect) or URLError (bad url)
        was raised
    """
    try:
        with urlopen(url):
            pass
    except error_classes:
        return False
    else:
        return True
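# A minimal, self-contained sketch of the same try/except connectivity probe
# that can_connect() implements, using only the standard library (assumption:
# plain urllib.request.urlopen stands in for pandas' context-manager wrapper,
# and the example URL is illustrative):
from urllib.request import urlopen as _urlopen
from urllib.error import URLError


def _can_connect_demo(url):
    # Returns True when the URL answers, False on any connection problem.
    try:
        with _urlopen(url, timeout=5):
            pass
    except (URLError, OSError):
        return False
    else:
        return True


# Example: _can_connect_demo("http://example.com") -> True on a networked box.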
def get_quote_yahoo(symbols):
    """
    Get current yahoo quote

    Returns a DataFrame
    """
    if isinstance(symbols, basestring):
        sym_list = symbols
    else:
        sym_list = '+'.join(symbols)

    # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
    request = ''.join(_yahoo_codes.itervalues())  # code request string
    header = _yahoo_codes.keys()

    data = defaultdict(list)

    url_str = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (sym_list,
                                                                   request)

    with urlopen(url_str) as url:
        lines = url.readlines()

    for line in lines:
        fields = line.decode('utf-8').strip().split(',')
        for i, field in enumerate(fields):
            if field[-2:] == '%"':
                v = float(field.strip('"%'))
            elif field[0] == '"':
                v = field.strip('"')
            else:
                try:
                    v = float(field)
                except ValueError:
                    v = np.nan
            data[header[i]].append(v)

    idx = data.pop('symbol')
    return DataFrame(data, index=idx)
def get_travis_data():
    """figure out what worker we're running on, and the number of jobs it's
    running
    """
    import os
    jobid = os.environ.get("TRAVIS_JOB_ID")
    if not jobid:
        return None, None

    with urlopen("https://api.travis-ci.org/workers/") as resp:
        workers = json.loads(resp.read())

    host = njobs = None
    for item in workers:
        host = item.get("host")
        id = ((item.get("payload") or {}).get("job") or {}).get("id")
        if id and str(id) == str(jobid):
            break

    if host:
        njobs = len([x for x in workers
                     if host in x['host'] and x['payload']])

    return host, njobs
def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None):
    self.ioargs = IOArgs(
        filepath_or_buffer=filepath_or_buffer,
        encoding=None,
        mode=None,
        compression={"method": None},
    )
    # If filepath_or_buffer is a url, load the data into a BytesIO
    if is_url(filepath_or_buffer):
        self.ioargs = IOArgs(
            filepath_or_buffer=BytesIO(urlopen(filepath_or_buffer).read()),
            should_close=True,
            encoding=None,
            mode=None,
            compression={"method": None},
        )
    elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
        self.ioargs = get_filepath_or_buffer(
            filepath_or_buffer, storage_options=storage_options
        )

    if isinstance(self.ioargs.filepath_or_buffer, self._workbook_class):
        self.book = self.ioargs.filepath_or_buffer
    elif hasattr(self.ioargs.filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        assert not isinstance(self.ioargs.filepath_or_buffer, str)
        self.ioargs.filepath_or_buffer.seek(0)
        self.book = self.load_workbook(self.ioargs.filepath_or_buffer)
    elif isinstance(self.ioargs.filepath_or_buffer, str):
        self.book = self.load_workbook(self.ioargs.filepath_or_buffer)
    elif isinstance(self.ioargs.filepath_or_buffer, bytes):
        self.book = self.load_workbook(BytesIO(self.ioargs.filepath_or_buffer))
    else:
        raise ValueError(
            "Must explicitly set engine if not passing in buffer or path for io."
        )
def _retry_read_url(url, retry_count, pause, name):
    """ Open url (and retry) """
    for _ in range(retry_count):
        # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                          parse_dates=True, na_values='-')[::-1]
            # Yahoo! Finance sometimes does this awesome thing where they
            # return 2 rows for the most recent business day
            if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
                rs = rs[:-1]

            # Get rid of unicode characters in index name.
            try:
                rs.index.name = rs.index.name.decode(
                    'unicode_escape').encode('ascii', 'ignore')
            except AttributeError:
                # Python 3 string has no decode method.
                rs.index.name = rs.index.name.encode('ascii', 'ignore').decode()

            return rs
        time.sleep(pause)

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" % (retry_count, name, url))
def get_data_famafrench(name):
    # path of zip files
    zip_file_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/'
                    'ken.french/ftp/')
    zip_file_path = '{0}{1}.zip'.format(zip_file_url, name)

    with urlopen(zip_file_path) as url:
        raw = url.read()

    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.read(name + '.txt').splitlines()

    line_lengths = np.array(map(len, data))
    file_edges = np.where(line_lengths)[0]

    datasets = {}
    edges = itertools.izip(file_edges[:-1], file_edges[1:])
    for i, (left_edge, right_edge) in enumerate(edges):
        dataset = [d.split() for d in data[left_edge:right_edge]]
        if len(dataset) > 10:
            ncol_raw = np.array(map(len, dataset))
            ncol = np.median(ncol_raw)
            header_index = np.where(ncol_raw == ncol - 1)[0][-1]
            header = dataset[header_index]
            ds_header = dataset[header_index + 1:]
            # to ensure the header is unique
            header = ['{0} {1}'.format(*items)
                      for items in enumerate(header, start=1)]
            index = np.fromiter((d[0] for d in ds_header), dtype=int)
            dataset = np.fromiter((d[1:] for d in ds_header), dtype=float)
            datasets[i] = DataFrame(dataset, index, columns=header)

    return datasets
def get_data_fred(name, start=dt.datetime(2010, 1, 1),
                  end=dt.datetime.today()):
    """
    Get data for the given name from the St. Louis FED (FRED).
    Date format is datetime

    Returns a DataFrame.
    """
    start, end = _sanitize_dates(start, end)

    fred_URL = "http://research.stlouisfed.org/fred2/series/"

    url = fred_URL + '%s' % name + '/downloaddata/%s' % name + '.csv'
    with urlopen(url) as resp:
        data = read_csv(resp, index_col=0, parse_dates=True, header=None,
                        skiprows=1, names=["DATE", name], na_values='.')
    try:
        return data.truncate(start, end)
    except KeyError:
        if data.ix[3].name[7:12] == 'Error':
            raise IOError("Failed to get the data. Check that {0!r} is "
                          "a valid FRED series.".format(name))
        raise
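# Hypothetical usage sketch for get_data_fred above (hedged: 'GDP' is just an
# example series name, and the call assumes the legacy fred2 CSV endpoint is
# still reachable):
#
#   import datetime as dt
#   gdp = get_data_fred('GDP', start=dt.datetime(2012, 1, 1),
#                       end=dt.datetime(2013, 1, 1))
#
# The CSV is parsed with DATE as the index and the series name as the single
# value column, then truncated to the [start, end] window.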
def get_all_results(repo_id=53976):  # travis pydata/pandas id
    """Fetches the VBENCH results for all travis builds, and returns a list
    of result df

    unsuccessful individual vbenches are dropped.
    """
    from collections import OrderedDict

    def get_results_from_builds(builds):
        dfs = OrderedDict()
        for build in builds:
            build_id = build['id']
            build_number = build['number']
            print(build_number)
            res = get_build_results(build_id)
            if res is not None:
                dfs[build_number] = res
        return dfs

    base_url = 'https://api.travis-ci.org/builds?url=%2Fbuilds&repository_id={repo_id}'
    url = base_url.format(repo_id=repo_id)
    url_after = url + '&after_number={after}'
    dfs = OrderedDict()

    while True:
        with urlopen(url) as r:
            if not (200 <= r.getcode() < 300):
                break
            builds = json.loads(r.read())
        res = get_results_from_builds(builds)
        if not res:
            break
        last_build_number = min(res.keys())
        dfs.update(res)
        url = url_after.format(after=last_build_number)

    return dfs
def dump_as_gist(data, desc="The Commit", njobs=None):
    host, njobs2 = get_travis_data()[:2]

    if njobs:  # be slightly more reliable
        njobs = max(njobs, njobs2)

    content = dict(
        version="0.1.1",
        timings=data,
        datetime=get_utcdatetime(),  # added in 0.1.1
        hostname=host,               # added in 0.1.1
        njobs=njobs  # added in 0.1.1, a measure of load on the travis box
    )

    payload = dict(description=desc,
                   public=True,
                   files={'results.json': dict(content=json.dumps(content))})
    try:
        with closing(urlopen("https://api.github.com/gists",
                             json.dumps(payload), timeout=WEB_TIMEOUT)) as r:
            if 200 <= r.getcode() < 300:
                print("\n\n" + "-" * 80)
                gist = json.loads(r.read())
                file_raw_url = list(gist['files'].items())[0][1]['raw_url']
                print("[vbench-gist-raw_url] %s" % file_raw_url)
                print("[vbench-html-url] %s" % gist['html_url'])
                print("[vbench-api-url] %s" % gist['url'])
                print("-" * 80 + "\n\n")
            else:
                print("api.github.com returned status %d" % r.getcode())
    except:
        print("Error occurred while dumping to gist")
def get_dividends_yahoo(sid, start, end):
    # Taken from get_data_yahoo in Pandas library and adjust a single
    # parameter to get dividends
    from pandas.compat import StringIO, bytes_to_str
    from pandas.io.common import urlopen

    start, end = pd.to_datetime(start), pd.to_datetime(end)
    url = (
        "http://ichart.finance.yahoo.com/table.csv?" + "s=%s" % sid +
        "&a=%s" % (start.month - 1) +
        "&b=%s" % start.day +
        "&c=%s" % start.year +
        "&d=%s" % (end.month - 1) +
        "&e=%s" % end.day +
        "&f=%s" % end.year +
        "&g=v" +  # THE CHANGE
        "&ignore=.csv"
    )

    with urlopen(url) as resp:
        lines = resp.read()
    rs = pd.read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                     parse_dates=True, na_values="-")[::-1]
    # Yahoo! Finance sometimes does this awesome thing where they
    # return 2 rows for the most recent business day
    if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
        rs = rs[:-1]
    return rs
def get_symbol_yahoo_stats_url(symbols):
    """ Get the symbols' basic statistics from Yahoo Finance.
    Input:
       symbols - a list of symbol strings, e.g. ['AAPL']
    Output: stats in Pandas DataFrame.
    This function is ported from pandas_datareader/yahoo/components.py
    """
    sym_list = str2list(symbols)
    if sym_list is None:
        return DataFrame()

    url_str = 'http://download.finance.yahoo.com/d/quotes.csv?'

    # Form a BUNCH of STOCK SYMBOLS separated by "+",
    # e.g. XOM+BBDb.TO+JNJ+MSFT
    sym_str = '+'.join(sym_list)
    url_str += 's=' + sym_str
    url_str = url_str.strip().replace(' ', '')  # remove all spaces

    # Yahoo Finance tags, refer to
    # http://www.financialwisdomforum.org/gummy-stuff/Yahoo-data.htm
    tags = {'s': 'Symbol', 'x': 'Exchange', 'j1': 'Market Cap',
            'b4': 'Book Value', 'r': 'P/E', 'p5': 'Price/Sales',
            'p6': 'Price/Book', 'j4': 'EBITDA', 'j': '52-week Low',
            'k': '52-week High', 'l1': 'Last Trade', 'd': 'Dividend/Share',
            'y': 'Dividend Yield', 'e': 'EPS', 's7': 'Short Ratio',
            's1': 'Shares Owned', 'f6': 'Float Shares'}
    url_str += '&f=' + ''.join(pd.compat.iterkeys(tags))

    with urlopen(url_str) as resp:
        raw = resp.read()
    lines = raw.decode('utf-8').strip().replace('"', '').split('\n')
    lines = [line.strip().split(',') for line in lines]
    if len(lines) < 1 or len(lines[0]) < len(tags):
        print('Error: failed to download Yahoo stats from %s' % url_str)
        return DataFrame()

    stats = DataFrame(lines, columns=list(tags.values()))
    stats = stats.drop_duplicates()
    stats = stats.set_index('Symbol')
    return stats
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(io):
        with urlopen(io) as url:
            raw_text = url.read()
    elif hasattr(io, 'read'):
        raw_text = io.read()
    elif os.path.isfile(io):
        with open(io) as f:
            raw_text = f.read()
    elif isinstance(io, string_types):
        raw_text = io
    else:
        raise TypeError("Cannot read object of type %r" % type(io).__name__)
    return raw_text
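# A minimal, standard-library sketch of the same url/file-like/path/string
# dispatch that the _read variants above implement (assumption: a bare
# "looks like a URL" prefix check stands in for pandas' _is_url helper):
import os
from urllib.request import urlopen as _urlopen


def _read_demo(io):
    if isinstance(io, str) and io.startswith(("http://", "https://")):
        with _urlopen(io) as url:          # remote resource
            return url.read()
    elif hasattr(io, "read"):              # file-like object
        return io.read()
    elif isinstance(io, str) and os.path.isfile(io):
        with open(io) as f:                # path on disk
            return f.read()
    elif isinstance(io, str):              # raw text passed straight through
        return io
    raise TypeError("Cannot read object of type %r" % type(io).__name__)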
def query_osm(typ, bbox=None, recurse=None, tags='', raw=False,
              meta=False, operation='and', **kwargs):
    """
    Query the Overpass API to obtain OpenStreetMap data.

    See also:
    http://wiki.openstreetmap.org/wiki/Overpass_API/Language_Guide

    The OSM XML data is parsed into an intermediate set of DataFrames.
    By passing in 'render=False', this will return these DataFrames stored
    as the OSMData namedtuple.

    If render is True, then the DataFrames are built into their
    corresponding geometries.

    Parameters
    ----------
    typ : {'node', 'way', 'relation'}
        The type of OSM data to query
    bbox : (min lon, min lat, max lon, max lat) bounding box
        Optional bounding box to restrict the query. Unless the query
        is extremely restricted, you usually want to specify this.
        It can be retrieved from GeoPandas objects as 'df.total_bounds'
        or from Shapely objects as 'geom.bounds'
    recurse : {'up', 'down', 'uprel', 'downrel'}
        This is used to get more data than the original query.
        If 'typ' is 'way', you'll usually want this set to 'down',
        which grabs all nodes of the matching ways
    tags : string or list of query strings
        See also the OverpassQL (referenced above) for more tag options
        Examples:
            tags='highway'
                Matches objects with a 'highway' tag
            tags='highway=motorway'
                Matches objects where the 'highway' tag is 'motorway'
            tags='name~[Mm]agazine'
                Match if the 'name' tag matches the regular expression
        Specify a list of tag requests to match all of them or any of them
            tags=['highway', 'name~"^Magazine"']
                Match tags that have 'highway' and where 'name' starts
                with 'Magazine'
    raw : boolean, default False
        Return the raw XML data returned by the request
    render : boolean, default True
        Parse the output and return a final GeoDataFrame
    meta : boolean, default False
        Indicates whether to query the metadata with each OSM object. This
        includes the changeset, timestamp, uid, user, and version.
    operation : {'and', 'or'}, default 'and'
        the operation of query conditions
        'and' : return a list of tag requests to match all of them
        'or' : return a list of tag requests to match any of them
    way_type : {'Line', 'Polygon'} (optional) when typ equals 'way'
        'Line' : the type of geometry in geodataframe is LineString
        'Polygon' : the type of geometry in geodataframe is Polygon

    Returns
    -------
    df - GeoDataFrame
        Note that there's probably a bit more filtering required to get
        the exact desired data. For example if you only want ways, you may
        want to grab only the linestrings like:
        >>> df = df[df.type == 'LineString']
    """
    url = _build_url(typ, operation, bbox, recurse, tags, meta)

    # TODO: Raise on non-200 (or 400-599)
    with urlopen(url) as response:
        content = response.read()

    if raw:
        return content

    return read_osm(content, **kwargs)
def get_COOPS_json(begin_dt, end_dt, base_url):
    """Function accepts: a base url (API endpoint), a beginning and end
    datetime string in the form 'YYYYMMDD mm:ss' which are <= 1 year apart,
    passing these to the query_builder function.

    Function returns the hourly prediction data as a PANDAS DataFrame Object
    where the returned time becomes the datetime index."""

    # import dependencies
    import pandas as pd
    import numpy as np
    from pandas.io.common import urlopen
    from pandas.io import json

    # construct the query
    query, query_dict = query_builder(begin_dt, end_dt, base_url)

    # execute query and read response
    with urlopen(query) as response:
        data = response.read()

    # convert json object to python dictionary and extract time and values
    # for predictions
    data = json.loads(data)['predictions']

    # read into PANDAS DataFrame, then manipulate DataFrame object
    data = pd.DataFrame(data)
    data.columns = ['Date_Time', 'Level']
    data.index = data.Date_Time
    data.index = pd.to_datetime(data.index)
    data = data.drop('Date_Time', axis=1)

    # reindex to fill in any missing time values, this needs
    # work to initialize the range on the data/query vs. hardcoding as it
    # currently stands.
    periods, begin, end = dt_periodizer(query_dict)
    begin_string = begin.strftime('%Y-%m-%d %H:%M:%S')
    rng = pd.date_range(begin_string, periods=periods, freq='6min')

    # the actual reindex itself needs to be reworked for a better fill.
    # a good start might be the median of the points directly above and
    # below the missing dt index. Since this is very few points typically
    # I am filling them with 100 for easy removal later. I would rather
    # remove the points than fill in a non-measured value.

    # protect against index duplicates
    data = data.reset_index()
    # reindex to datetime
    data = data.reindex(rng, fill_value=100)

    # convert value from string to float
    data.Level = data.Level.astype(float)

    # adjust level to account for distance of Carkeek from NOAA
    # monitoring station (+ 5%)
    level_adjust = data.Level.values + (.05 * data.Level.values)
    data.Level = np.round(level_adjust, decimals=2)

    # add date column to dataframe for later use with weather data
    data['Date'] = data.index.date

    # add a column for hourly re-sample
    # data['Hour'] = data.index.hour
    # data['Time'] = data.index.time

    # return DataFrame object
    return data
def get_hist_data(code=None, start=None, end=None, ktype='D',
                  retry_count=3, pause=0.001):
    """
    Fetch the historical trading records of an individual stock.

    Parameters
    ------
    code : string
        Stock code, e.g. 600848
    start : string
        Start date, format YYYY-MM-DD; if empty, data starts from the
        earliest date the API provides.
    end : string
        End date, format YYYY-MM-DD; if empty, data runs to the most
        recent trading day.
    ktype : string
        Data frequency: D=daily, W=weekly, M=monthly, 5=5min, 15=15min,
        30=30min, 60=60min. Default is D.
    retry_count : int, default 3
        Number of times to retry on network problems.
    pause : int, default 0
        Seconds to pause between repeated requests, to avoid problems
        caused by requests arriving too close together.

    return
    -------
    DataFrame
        Columns: date, open, high, close, low, volume, price change,
        percent change, 5/10/20-day mean price, 5/10/20-day mean volume,
        turnover rate.
    """
    symbol = code_to_symbol(code)
    url = ''
    if ktype.upper() in ct.K_LABELS:
        url = ct.DAY_PRICE_URL % (ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                  ct.K_TYPE[ktype.upper()], symbol)
    elif ktype in ct.K_MIN_LABELS:
        url = ct.DAY_PRICE_MIN_URL % (ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                      symbol, ktype)
    else:
        raise TypeError('ktype input error.')

    for _ in range(retry_count):
        time.sleep(pause)
        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            js = json.loads(lines)
            cols = []
            if (code in ct.INDEX_LABELS) & (ktype.upper() in ct.K_LABELS):
                cols = ct.INX_DAY_PRICE_COLUMNS
            else:
                cols = ct.DAY_PRICE_COLUMNS
            df = pd.DataFrame(js['record'], columns=cols)
            if ktype.upper() in ['D', 'W', 'M']:
                df = df.applymap(lambda x: x.replace(u',', u''))
            for col in cols[1:]:
                df[col] = df[col].astype(float)
            if start is not None:
                df = df[df.date >= start]
            if end is not None:
                df = df[df.date <= end]
            if (code in ct.INDEX_LABELS) & (ktype in ct.K_MIN_LABELS):
                df = df.drop('turnover', axis=1)
            df = df.set_index('date')
            return df
    raise IOError("%s fetch failed; check the network and the URL: %s"
                  % (code, url))
def get_data_yahoo_actions(symbol, start=None, end=None, retry_count=3,
                           pause=0.001):
    """
    Returns DataFrame of historical corporate actions (dividends and stock
    splits) from symbols, over date range, start to end.

    Parameters
    ----------
    sym : string
        Single stock symbol (ticker).
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Time, in seconds, of the pause between retries.
    """
    start, end = _sanitize_dates(start, end)
    url = (_HISTORICAL_YAHOO_ACTIONS_URL + 's=%s' % symbol +
           '&a=%s' % (start.month - 1) +
           '&b=%s' % start.day +
           '&c=%s' % start.year +
           '&d=%s' % (end.month - 1) +
           '&e=%s' % end.day +
           '&f=%s' % end.year +
           '&g=v')

    for _ in range(retry_count):
        time.sleep(pause)

        try:
            with urlopen(url) as resp:
                lines = resp.read()
        except _network_error_classes:
            pass
        else:
            actions_index = []
            actions_entries = []

            for line in csv.reader(StringIO(bytes_to_str(lines))):
                # Ignore lines that aren't dividends or splits (Yahoo
                # add a bunch of irrelevant fields.)
                if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'):
                    continue

                action, date, value = line
                if action == 'DIVIDEND':
                    actions_index.append(to_datetime(date))
                    actions_entries.append({
                        'action': action,
                        'value': float(value)
                    })
                elif action == 'SPLIT' and ':' in value:
                    # Convert the split ratio to a fraction. For example a
                    # 4:1 split expressed as a fraction is 1/4 = 0.25.
                    denominator, numerator = value.split(':', 1)
                    split_fraction = float(numerator) / float(denominator)

                    actions_index.append(to_datetime(date))
                    actions_entries.append({
                        'action': action,
                        'value': split_fraction
                    })

            return DataFrame(actions_entries, index=actions_index)

    raise IOError("after %d tries, Yahoo! did not "
                  "return a 200 for url %r" % (retry_count, url))
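# A standalone illustration of the split-ratio arithmetic used above: a
# "4:1" split becomes the fraction 1/4 = 0.25 (the denominator:numerator
# order matches the parsing in get_data_yahoo_actions).
def _split_fraction_demo(value):
    denominator, numerator = value.split(':', 1)
    return float(numerator) / float(denominator)


assert _split_fraction_demo("4:1") == 0.25
assert _split_fraction_demo("2:1") == 0.5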