class FundMapping(object): def __init__(self): self._collection = Mongodb('192.168.250.200', 27017, 'fund', 'base_fund') self._url = 'http://fund.csrc.gov.cn/web/classification_show.organization' def get_fund_mapping(self): # sub_code, sub_name, main_code, main_name sub_to_main_mapping = [] html = requests.get(self._url, timeout=30.0).content document = PyQuery(unicode(html, 'utf-8')) fund_blocks = [document.items('.aa'), document.items('.dd')] for each_block in fund_blocks: for class_tag in each_block: items_list = [item.text() for item in class_tag.items('td')] sub_to_main_mapping.append((items_list[1], items_list[3])) return dict(sub_to_main_mapping) def update_to_mongo(self): fund_mapping = self.get_fund_mapping() for item in self._collection.query(kwargs={'code': 1}).sort([('_id', 1)]): key = item['code'][:6] main_fund_code = fund_mapping.get(key) if main_fund_code is not None: regex = re.compile(r'{0}'.format(main_fund_code)) main_fund_sid = self._collection.get({'code': regex}, {'sid': 1}) print 'main:', main_fund_sid _main = (main_fund_sid or {}).get('sid', '') self._collection.update({'_id': item['_id']}, setdata={'main': _main})
def third_update(): coll_in = Mongodb('192.168.251.95', 27017, 'news', 'announcement_hk_chz') coll_cat = Mongodb('192.168.251.95', 27017, 'ada', 'dict_announce_catalog_hk') coll_secu = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock') kt = 0 cdctuo = ThirdUpdate().main() cd_dt_cat_tit_url_ori = cdctuo if cdctuo else [] for codes, dt, cat, title, url, cat_origin in cd_dt_cat_tit_url_ori: kt += 1 for code in codes: secu = get_secu(code, coll_secu) if secu and not coll_in.get({'sid': url, 'secu.0.cd': secu[0]['cd']}, {'title': 1}): print 'kt:', kt, '|', code, '|', dt, '|', url, '\n|', title try: hk_data = post_dict(secu, dt, cat, title, url, cat_origin, coll_cat) coll_in.insert(hk_data) except Exception as e: print 'Error:', e.message # 创建索引 # inds_mon = coll_in.get({'sid': url}, {'title': 1}) # ind_url = "http://192.168.250.205:17081/indexer/services/indexes/delta.json?" \ # "indexer=announce_hkz&taskids=" # if inds_mon: # jdata = BaseDownloadHtml().get_html(ind_url + str(inds_mon['_id']))[0] # if json.loads(jdata)['code'] == 200: # print '\tcreate index is ok!\n\n' coll_in.disconnect() coll_cat.disconnect() coll_secu.disconnect()
def main(self, query=None): if query is None: query_date = [str(datetime.date.today())] else: query_date = query flag = False min_date = min(query_date) coll = Mongodb('192.168.251.95', 27017, 'news', 'research_report_def') url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?' query_string = 'type=SR&sty=GGSR&ps=50&p=%s&mkt=0&stat=0&cmd=2&code=&rt=' for page in range(1, 20): py_data = json.loads(self.get_html(url + query_string % str(page), encoding=True)[1:-1]) for data in py_data: code, agency = data['secuFullCode'][:6], data['insName'] date_time, url_info_code = data['datetime'][:10], data['infoCode'] report_url = 'http://data.eastmoney.com/report/%s/%s.html' % (date_time.replace('-', ''), url_info_code) if date_time in query_date: src = self.rr_research_org_code(agency) or '' # get src secu = self.base_stock_code(code) or '' # get secu if coll.get({'url': report_url}, {'titl': 1}) is None: try: now_html = self.get_html(report_url, encoding=True) title = self.remove_tag(self.__title.findall(now_html)[0]) content = self.remove_tag(self.__content.findall(now_html)[0]) to_data = { 'url': report_url, 'titl': {'szh': title, 'en': ''}, 'bio': {'en': '', 'szh': content}, 'rdt': date_time, 'upu': '', 'typ': '30001', 'stat': 1, 'upt': datetime.datetime.now(), 'crt': datetime.datetime.now(), } to_data.update({'src': src, 'secu': secu}) if not src or not secu: vn_src = '' if src else agency vn_secu = '' if secu else code to_data['vn'] = '^'.join([vn_src, vn_secu]) else: to_data['vn'] = None coll.insert(to_data) print '[%s %s FROM %s] -->>> Now insert mongodb!' % (code, date_time, agency) except Exception as e: print 'title: %s, url: %s' % (data['title'], report_url), 'Error:', e else: print '[%s %s FROM %s] -->>> mongodb table is existed' % (code, date_time, agency) elif date_time < min_date: flag = True break if flag: break coll.disconnect()
def main(self): if not self._validity: print "SZX this is Saturday or Monday!" return 0 coll_in = Mongodb("192.168.251.95", 27017, "ada", "base_margin_trading") coll_stock = Mongodb("192.168.251.95", 27017, "ada", "base_stock") coll_fund = Mongodb("192.168.251.95", 27017, "fund", "base_fund") url = "http://www.szse.cn/szseWeb/FrontController.szse?randnum=&" t = lambda v: "%.4f" % float(v) for page in range(1, 30): break_point = False html = self.get_html(url + self._query_string.format(self._query_date, page), encoding=True) for it in self.extract(html): # print it[0], it[1], it[2], it[3], it[4], it[5], it[6] break_point = True secu_cd = secu_code(it[0], coll_stock, coll_fund) fiba_bre = szx_fiba_bre(secu_cd, coll_in, self._query_date) sema_bre = szx_sema_bre(secu_cd, coll_in, self._query_date) # 本日融资偿还额 = 前日融资余额 + 本日融资买入- 本日融资余额(元) (fi.re = fi.ba(上期) + fi.bu - fi.ba) # 融券偿还量 = 融券卖出量 + 融券余量(上期) - 融券余量 (se.re = se.so + se.ma(上期) - se.ma) szx_fs_data = { "secu": secu_cd or it[0], "date": self._query_date, "total": t(it[6]), "stat": 2, "typ": "szx", "crt": datetime.now(), "fi": {"ba": t(it[2]), "bu": t(it[1]), "re": t(float(it[1]) + fiba_bre - float(it[2]))}, "se": { "ba": t(it[5]), "ma": t(it[4]), "so": t(it[3]), "re": t(float(it[3]) + sema_bre - float(it[4])), }, "upt": datetime.now(), } print szx_fs_data if not coll_in.get({"secu": secu_cd or it[0], "date": self._query_date, "typ": "szx"}): print coll_in.insert(szx_fs_data) if not break_point: break print u"szx [%s] 融资融券交易明细 day update: %d page done!" % (self._query_date, page) # break coll_in.disconnect() coll_stock.disconnect() coll_fund.disconnect()
def insert_db(self, total_data): coll_in = Mongodb('192.168.251.95', 27017, 'ada', 'base_margin_trading') coll_stock = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock') coll_fund = Mongodb('192.168.251.95', 27017, 'fund', 'base_fund') sql_db = MySQLClient("192.168.251.95", "python_team", "python_team", "ada-fd") print '\tnow start to insert mongodb, waiting......' d = (lambda v: '%.4f' % float(v)) for pdt in total_data: # 信用交易日期 标的证券代码 标的证券简称 本日融资余额(元) 本日融资买入额(元) # 本日融资偿还额(元) 本日融券余量 本日融券卖出量 本日融券偿还量 secu_cd = secu_code(pdt[1], coll_stock, coll_fund) trade_date = '-'.join([pdt[0][:4], pdt[0][4:6], pdt[0][6:]]) uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, ''.join(self._valid(pdt)).encode('u8'))) data = { 'secu': secu_cd or pdt[1], 'date': trade_date, 'total': d(int(pdt[3])), 'stat': 2, 'typ': 'sha', 'crt': datetime.now(), 'uuid': uid, 'fi': { 'ba': d(pdt[3]), 'bu': d(pdt[4]), 're': d(pdt[5]) }, 'se': { 'ba': '0.0000', 'ma': d(pdt[6]), 'so': d(pdt[7]), 're': d(pdt[8]) }, 'upt': datetime.now() } if coll_in.get({'uuid': uid, 'typ': 'sha'}, {'secu': 1}): continue elif secu_cd is None: coll_in.insert(data) else: seba = sha_seba(secu_cd, pdt[6], trade_date, sql_db) if seba is not None: data['total'] = d(int(pdt[3]) + seba) data['se']['ba'] = d(seba) coll_in.insert(data) coll_in.disconnect() coll_stock.disconnect() sql_db.disconnect() print '\tinsert all done!'
def update(): coll_in = Mongodb('192.168.251.95', 27017, 'news', 'announcement_hk_chz') coll_cat = Mongodb('192.168.251.95', 27017, 'ada', 'dict_announce_catalog_hk') coll_secu = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock') count = 0 for code, query in codes_date: ktt = 0 count += 1 validate(code, query) print '[%s-->>%s,%s]' % (count, code, query), ':waiting few minutes......\n' dctu = PoskUpdate(code, query).main() # codes, date, cat, title, url for codes, dt, cat, title, url, cat_origin in dctu: ktt += 1 print '\t[%s ->> ktt:%s]' % (code, ktt), '|', codes, '|', dt, '|', title, '|', url for code_ in codes: secu = get_secu(code_, coll_secu) print 'secu:', secu if secu and not coll_in.get({'sid': url}, {'title': 1}): try: hk_data = post_dict(secu, dt, cat, title, url, cat_origin, coll_cat) coll_in.insert(hk_data) except Exception as e: print '\t[%s] |%s|upload error: %s!' % (code_, dt, e.message) # inds_mon = coll_in.get({'sid': url}, {'title': 1}) # ind_url = "http://192.168.250.205:17081/indexer/services/indexes/delta.json?" \ # "indexer=announce_hkz&taskids=" # if inds_mon: # 创建索引 # jdata = BaseDownloadHtml().get_html(ind_url + str(inds_mon['_id']))[0] # if json.loads(jdata)['code'] == 200: # print '\tcreate index is ok!\n\n' if ktt % 80 == 0: sleep(2 * 60) coll_in.disconnect() coll_cat.disconnect() coll_secu.disconnect()
def main(self): if not self._validity: print 'SZX this is Saturday or Monday!' return 0 coll_in = Mongodb('192.168.251.95', 27017, 'ada', 'base_margin_trading') coll_stock = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock') coll_fund = Mongodb('192.168.251.95', 27017, 'fund', 'base_fund') url = 'http://www.szse.cn/szseWeb/FrontController.szse?randnum=&' t = lambda v: '%.4f' % float(v) for page in range(1, 30): break_point = False html = self.get_html( url + self._query_string.format(self._query_date, page), encoding=True) for it in self.extract(html): # print it[0], it[1], it[2], it[3], it[4], it[5], it[6] break_point = True secu_cd = secu_code(it[0], coll_stock, coll_fund) fiba_bre = szx_fiba_bre(secu_cd, coll_in, self._query_date) sema_bre = szx_sema_bre(secu_cd, coll_in, self._query_date) # 本日融资偿还额 = 前日融资余额 + 本日融资买入- 本日融资余额(元) (fi.re = fi.ba(上期) + fi.bu - fi.ba) # 融券偿还量 = 融券卖出量 + 融券余量(上期) - 融券余量 (se.re = se.so + se.ma(上期) - se.ma) szx_fs_data = { 'secu': secu_cd or it[0], 'date': self._query_date, 'total': t(it[6]), 'stat': 2, 'typ': 'szx', 'crt': datetime.now(), 'fi': { 'ba': t(it[2]), 'bu': t(it[1]), 're': t(float(it[1]) + fiba_bre - float(it[2])) }, 'se': { 'ba': t(it[5]), 'ma': t(it[4]), 'so': t(it[3]), 're': t(float(it[3]) + sema_bre - float(it[4])) }, 'upt': datetime.now() } print szx_fs_data if not coll_in.get({ 'secu': secu_cd or it[0], 'date': self._query_date, 'typ': 'szx' }): print coll_in.insert(szx_fs_data) if not break_point: break print u'szx [%s] 融资融券交易明细 day update: %d page done!' % ( self._query_date, page) # break coll_in.disconnect() coll_stock.disconnect() coll_fund.disconnect()