def _update_ohlc_daily(date, symbol, table, exchange):
    # symbol == 'ALL' means every known symbol on the exchange;
    # `table` is accepted for interface symmetry but unused here.
    if symbol == 'ALL':
        items = get_all_symbols(exchange)
    else:
        items = [symbol]
    data = pd.DataFrame()
    cur_idx = 0
    total = len(items)
    for t in items:
        cur_idx += 1
        print("%s/%s .. " % (cur_idx, total))
        df = get_date_ohlc(exchange, t, date)
        if df is not None:
            if len(df) == 1:
                data = data.append(df)
            else:
                # log the failing symbol `t`, not the 'ALL' placeholder
                yyhtools.info("%s %s %s wrong data." % (exchange, t, date))
                yyhtools.error(str(df))
    yyhtools.info("_update_ohlc_daily finished")
    if len(data) == 0:
        return None
    return data
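
# Usage sketch for the updater above. The `engine` object and the
# 'us_ohlc_daily' table name are assumptions for illustration, not names
# confirmed by this module.
if __name__ == '__main__':
    day = datetime.date(2016, 12, 2)
    rows = _update_ohlc_daily(day, 'ALL', 'us_ohlc_daily', 'NYSE')
    if rows is not None:
        rows.to_sql('us_ohlc_daily', engine, if_exists='append')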
def _update_ohlc_daily(date, curr_id, table):
    # curr_id == 0 means every known instrument.
    if curr_id == 0:
        items = codes.all_items
    else:
        item = codes.currid2item.get(curr_id)
        if not item:
            yyhtools.error("curr_id=%s not found" % curr_id)
            return None
        items = [item]
    data = pd.DataFrame()
    cur_idx = 0
    total = len(items)
    for t in items:
        cur_idx += 1
        print("%s/%s .. " % (cur_idx, total))
        page_url = INVESTING_HOST + t["page_url"]
        df = get_data(page_url, API_URL, t["curr_id"], date)
        if df is not None:
            if len(df) == 1:
                data = data.append(df)
            else:
                yyhtools.info("%s(curr_id=%s) fetched duplicate rows." % (t['name'], t['curr_id']))
                yyhtools.error(str(df))
        # else:
        #     yyhtools.error("%s(curr_id=%s) is None" % (t['name'], t['curr_id']))
    print("_update_ohlc_daily finished")
    if len(data) == 0:
        return None
    return data
def get_data(exchange):
    df = None
    for _ in range(3):
        try:
            time.sleep(0.1)
            csv_path = ("http://www.nasdaq.com/screening/companies-by-name.aspx"
                        "?letter=0&exchange=%s&render=download" % exchange)
            df = pd.read_csv(csv_path)
            df['Symbol'] = df['Symbol'].str.strip()
            df['MarketCap'] = df['MarketCap'].apply(marketcap_to_float)
            df.to_sql('us_%s' % exchange, engine, if_exists='replace',
                      index=True, index_label='id')
            ytrack.success("us_%s updated successfully." % exchange)
            break
        except requests.exceptions.ConnectionError:
            yyhtools.error(traceback.format_exc())
            continue  # retry instead of giving up on the first failure
    if df is None:
        ytrack.error("us_%s update failed." % exchange)
        return
    # backfill Google Finance cids for any symbols we have not seen yet
    symbols = df['Symbol'].values.tolist()
    sql = 'select Symbol, cid from us_%s_cid' % exchange
    try:
        cids = dict(engine.execute(sql).fetchall())
        for symbol in symbols:
            if symbol not in cids:
                cid = get_cid(exchange, symbol)
                sql = ('insert into us_%s_cid(Symbol, cid) values("%s", "%s")'
                       % (exchange, symbol, cid))
                engine.execute(sql)
                ytrack.success("cid(%s, %s)=%s" % (exchange, symbol, cid))
    except Exception:
        yyhtools.error(traceback.format_exc())
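
# `marketcap_to_float` is referenced above but not defined in this module.
# A minimal sketch, assuming the screener's MarketCap column holds strings
# like "$1.23B", "$456.7M", or "n/a" (the exact format is an assumption):
def marketcap_to_float(s):
    s = str(s).strip().lstrip('$')
    units = {'B': 1e9, 'M': 1e6, 'K': 1e3}
    if s and s[-1].upper() in units:
        return float(s[:-1]) * units[s[-1].upper()]
    try:
        return float(s)
    except ValueError:
        return 0.0  # "n/a" and other non-numeric values map to 0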
def get_all_data(items, data_dir):
    if DEBUG:
        items = items[:1]
    for t in items:
        dst = "%s/%s/%s.csv" % (CURDIR, data_dir, t['code'])
        if os.path.exists(dst):
            yyhtools.error("%s exists.." % dst)
            continue
        page_url = INVESTING_HOST + t["page_url"]
        df = get_data(page_url, API_URL, t["curr_id"])
        if df is not None:
            df.to_csv(dst)
            yyhtools.info("%s finished." % dst)
        else:
            yyhtools.error("%s is None" % dst)
def get_cid(exchange, symbol):
    for _ in range(3):
        try:
            time.sleep(0.005)
            page_url = ('https://www.google.com.hk/finance/historical?q=%s:%s'
                        % (exchange, symbol))
            r = s.get(page_url, proxies=proxies)
            html = lxml.html.parse(StringIO(r.text))
            res = html.xpath('//input[@name="cid"]')
            if len(res) > 0:
                return res[0].value
            return '0'
        except Exception:
            print(traceback.format_exc())
            yyhtools.error(traceback.format_exc())
    return '0'
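
# Example (hypothetical symbol): resolve the Google Finance cid for one
# NASDAQ listing; '0' signals that the lookup failed or found no match.
if __name__ == '__main__':
    print(get_cid('NASDAQ', 'AAPL'))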
def get_all_symbols(exchange):
    try:
        sql = 'select Symbol, cid from us_%s_cid' % exchange
        rows = engine.execute(sql).fetchall()
        symbols = []
        for symbol, cid in rows:
            # skip symbols whose cid lookup failed
            if not cid or cid == '0':
                continue
            symbols.append(symbol)
        yyhtools.success("get_all_symbols: len(symbols) = %s" % len(symbols))
        return symbols
    except Exception:
        yyhtools.error("get_all_symbols failed, returning empty list")
        yyhtools.error(traceback.format_exc())
        return []
def get_date_ohlc(exchange, symbol, date):
    print("%s %s %s" % (exchange, symbol, date))
    for _ in range(3):
        try:
            time.sleep(0.005)
            page_url = ('https://www.google.com.hk/finance/historical?q=%s:%s'
                        % (exchange, symbol))
            r = s.get(page_url, proxies=proxies)
            html = lxml.html.parse(StringIO(r.text))
            res = html.xpath('//table[@class="gf-table historical_price"]')
            sarr = ''.join(etree.tostring(node) for node in res)
            if sarr == '':
                return None
            df = pd.read_html(sarr, skiprows=[0])[0]
            df.columns = ['date', 'open', 'high', 'low', 'close', 'amount']
            df = df.drop('amount', axis=1)

            def date_to_int(s):
                y, m, d = s.split("-")
                return int(y) * 10000 + int(m) * 100 + int(d)

            df['date'] = df['date'].apply(date_to_int)
            df = df.drop_duplicates('date')
            # keep only the row for the requested date
            cmp_d = int(date.strftime("%Y%m%d"))
            df = df[df.date == cmp_d]
            if len(df) > 0:
                code = get_code(symbol)
                assert code > 0, 'symbol code is %s' % code
                df.insert(0, 'code', code)
                df = df.set_index('code')
                return df
            return None
        except Exception:
            print(traceback.format_exc())
            yyhtools.error(traceback.format_exc())
    return None
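
# Example (hypothetical symbol): fetch the 2016-12-02 bar for one listing;
# the result is a one-row frame indexed by the internal code, or None.
if __name__ == '__main__':
    bar = get_date_ohlc('NASDAQ', 'AAPL', datetime.date(2016, 12, 2))
    print(bar)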
def get_data():
    page_url = 'http://www.jin10.com/'
    s = requests.Session()
    s.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
    })
    r = None
    for _ in range(3):
        try:
            time.sleep(0.5)
            r = s.get(page_url)
            break
        except requests.exceptions.ConnectionError:
            yyhtools.error("%s" % page_url)
            yyhtools.error(traceback.format_exc())
            continue  # retry instead of giving up on the first failure
    if r is None:
        yyhtools.error("requests.get('%s') is None" % page_url)
        return
    soup = BeautifulSoup(r.text.encode(r.encoding))
    allnews = soup.findAll("div", {"class": "newsline"})
    cnt = 0
    session = Session()
    for news in allnews:
        try:
            id = int(news.attrs.get('id')) // 100
            html = str(news)
            session.merge(News(id=id, html=html))
            cnt += 1
        except Exception:
            ytrack.fail(traceback.format_exc())
    session.commit()
    ytrack.success("%s: successfully updated %s records." % ('jin10_news', cnt))
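
# `News` and the `Session` factory come from elsewhere. A minimal
# SQLAlchemy sketch consistent with the session.merge() call above;
# the table and column definitions are assumptions:
from sqlalchemy import Column, BigInteger, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class News(Base):
    __tablename__ = 'jin10_news'
    id = Column(BigInteger, primary_key=True)  # page element id // 100
    html = Column(Text)                        # raw <div class="newsline"> markup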
def get_data(page_url, api_url, curr_id):
    s = requests.Session()
    s.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
    })
    resp = None
    for _ in range(3):
        try:
            time.sleep(0.5)
            resp = s.get(page_url)  # prime cookies before hitting the API
            break
        except requests.exceptions.ConnectionError:
            yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
            yyhtools.error(traceback.format_exc())
            continue
    if resp is None:
        return None
    s.headers.update({"X-Requested-With": "XMLHttpRequest"})
    data = {
        "action": "historical_data",
        "curr_id": str(curr_id),
        "interval_sec": "Daily",
    }
    end_date = datetime.datetime(2016, 12, 2, 0, 0)
    result = pd.DataFrame()
    # walk backwards through history, 500 days per request
    while True:
        st_date = end_date - datetime.timedelta(days=500)
        data['st_date'] = st_date.strftime("%Y/%m/%d")
        data['end_date'] = end_date.strftime("%Y/%m/%d")
        r = None
        for _ in range(3):
            try:
                time.sleep(0.5)
                r = s.post(api_url, data=data)
                break
            except requests.exceptions.ConnectionError:
                yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
                yyhtools.error(traceback.format_exc())
                continue
        if r is None:
            break
        html = lxml.html.parse(StringIO(r.text))
        try:
            res = html.xpath('//table[@id="curr_table"]')
        except Exception:
            yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
            yyhtools.error(traceback.format_exc())
            break
        if six.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        if sarr == '':
            break
        df = pd.read_html(sarr)[0]
        if len(df) == 0:
            break
        if len(df) == 1 and df.iloc[0][u'日期'] == 'No results found...':
            break
        result = result.append(df, ignore_index=True)
        end_date = st_date - datetime.timedelta(days=1)
        if len(df) < 10:
            print(df)
        if DEBUG:
            break
    if len(result) > 0:
        if len(result.columns) == 6:
            result.columns = ['date', 'close', 'open', 'high', 'low', 'percentage']
        else:
            result.columns = ['date', 'close', 'open', 'high', 'low', 'amount', 'percentage']
        result['date'] = pd.to_datetime(result['date'], format=u"%Y年%m月%d日")
        return result
    return None
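
# Usage sketch: pull the full daily history for one investing.com
# instrument. The page path and curr_id below are placeholders, not
# real identifiers.
if __name__ == '__main__':
    hist = get_data(INVESTING_HOST + '/currencies/eur-usd', API_URL, 1)
    if hist is not None:
        print(hist.head())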
def get_data(page_url, api_url, curr_id, end_date):
    '''Fetch the data for the single day end_date.'''
    # Unlike the full-history variant, the cookie-priming GET on page_url
    # is skipped here; the module-level session `s` is assumed to have
    # visited the site already.
    data = {
        "action": "historical_data",
        "curr_id": str(curr_id),
        "interval_sec": "Daily",
    }
    st_date = end_date
    data['st_date'] = st_date.strftime("%Y/%m/%d")
    data['end_date'] = end_date.strftime("%Y/%m/%d")
    r = None
    for _ in range(3):
        try:
            time.sleep(0.5)
            r = s.post(api_url, data=data)
            break
        except requests.exceptions.ConnectionError:
            yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
            yyhtools.error(traceback.format_exc())
            continue
    if r is None:
        return None
    html = lxml.html.parse(StringIO(r.text))
    try:
        res = html.xpath('//table[@id="curr_table"]')
    except Exception:
        yyhtools.error("%s %s %s" % (page_url, api_url, curr_id))
        yyhtools.error(traceback.format_exc())
        return None
    if six.PY3:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    else:
        sarr = [etree.tostring(node) for node in res]
    sarr = ''.join(sarr)
    if sarr == '':
        return None
    df = pd.read_html(sarr)[0]
    if len(df) == 0:
        return None
    if len(df) == 1 and df.iloc[0][u'日期'] == 'No results found...':
        return None
    if len(df.columns) == 6:
        df.columns = ['date', 'close', 'open', 'high', 'low', 'volume']
        df.insert(5, 'amount', 0)
    else:
        df.columns = ['date', 'close', 'open', 'high', 'low', 'amount', 'volume']
    df['date'] = pd.to_datetime(df['date'], format=u"%Y年%m月%d日")
    df = df.drop('volume', axis=1)
    df = df.drop('amount', axis=1)
    df = df[df.date == end_date]
    if len(df) > 1:
        yyhtools.error("%s %s duplicate rows." % (curr_id, end_date))
        yyhtools.error(str(df))
        df = df[:1]
    df.insert(0, 'code', curr_id)
    df = df.set_index('code')
    df['date'] = int(end_date.strftime("%Y%m%d"))
    return df
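
# Usage sketch: `end_date` must be a midnight datetime so the
# `df.date == end_date` comparison matches the Timestamps produced by
# pd.to_datetime; the page path and curr_id=1 are placeholders.
if __name__ == '__main__':
    bar = get_data(INVESTING_HOST + '/currencies/eur-usd', API_URL, 1,
                   datetime.datetime(2016, 12, 2))
    print(bar)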