def get_base_stock_code(stock_code):
    coll = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
    try:
        for d in coll.query({'tick': stock_code}).sort([('crt', 1)]):
            return d.get('code')
    except Exception as e:
        print 'failed to get code from base_stock. Error:', e

def handle_163_text():
    """Find the text records on 208 that distort the `sum` field."""
    pattern = re.compile(r'网易财经会赚钱的客户端|网易财经 会赚钱的客户端')
    query_cond = {'url': {'$regex': re.compile(r'163\.com')}, 'ratio': 0}
    coll = Mongodb('192.168.0.223', 27017, 'news_crawl', 'hot_news')
    # print coll.query({'content': {'$regex': re.compile(r'%s' % text)}}).count()
    for k, doc in enumerate(coll.query(query_cond), 1):
        content = doc['content']
        if pattern.search(content) is not None:
            url = doc['url']
            title = doc['title'].split('重点推荐', 1)[0]
            auth = doc['author']
            pub_date = doc['date']
            cat = doc.get('source')
            ratio = doc.get('ratio')
            new_content = pattern.split(content, 1)[0].split('div,h1', 1)[0].strip('#&# ')
            if cat and new_content:
                lines = [url, title, auth, str(pub_date), new_content, cat, str(ratio)]
                write(path + '20151119/', str(pub_date), lines)
                print 'id:', doc['_id'], k
    coll.disconnect()

def base_stock_code(self, stock_code):
    coll = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
    try:
        for d in coll.query({'tick': stock_code}).sort([('crt', 1)]):
            return d.get('code')
    except Exception as e:
        print 'failed to get code from base_stock. Error:', e

class FundMapping(object):
    def __init__(self):
        self._collection = Mongodb('192.168.250.200', 27017, 'fund', 'base_fund')
        self._url = 'http://fund.csrc.gov.cn/web/classification_show.organization'

    def get_fund_mapping(self):
        # sub_code, sub_name, main_code, main_name
        sub_to_main_mapping = []
        html = requests.get(self._url, timeout=30.0).content
        document = PyQuery(unicode(html, 'utf-8'))
        fund_blocks = [document.items('.aa'), document.items('.dd')]
        for each_block in fund_blocks:
            for class_tag in each_block:
                items_list = [item.text() for item in class_tag.items('td')]
                sub_to_main_mapping.append((items_list[1], items_list[3]))
        return dict(sub_to_main_mapping)

    def update_to_mongo(self):
        fund_mapping = self.get_fund_mapping()
        for item in self._collection.query(kwargs={'code': 1}).sort([('_id', 1)]):
            key = item['code'][:6]
            main_fund_code = fund_mapping.get(key)
            if main_fund_code is not None:
                regex = re.compile(r'{0}'.format(main_fund_code))
                main_fund_sid = self._collection.get({'code': regex}, {'sid': 1})
                print 'main:', main_fund_sid
                _main = (main_fund_sid or {}).get('sid', '')
                self._collection.update({'_id': item['_id']}, setdata={'main': _main})

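# A minimal usage sketch, assuming the custom Mongodb wrapper is importable and
# fund.csrc.gov.cn is reachable; the __main__ guard is illustrative only:
if __name__ == '__main__':
    mapper = FundMapping()
    print len(mapper.get_fund_mapping())  # number of sub-fund -> main-fund code pairs
    mapper.update_to_mongo()              # writes each fund's main `sid` back to base_fund
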
def rr_research_org_code(self, origin):
    coll = Mongodb('192.168.251.95', 27017, 'ada', 'rr_research_org')
    try:
        for doc in coll.query({'abbr.szh': {'$regex': origin}}):
            if doc['abbr']['szh'] == origin or origin in doc['rs']:
                return doc['code']
    except Exception as e:
        print 'failed to get code by origin. Error:', e

def get_rr_research_org_code(origin):
    coll = Mongodb('192.168.251.95', 27017, 'ada', 'rr_research_org')
    try:
        for doc in coll.query({'abbr.szh': {'$regex': origin}}):
            if doc['abbr']['szh'] == origin or origin in doc['rs']:
                return doc['code']
    except Exception as e:
        print 'failed to get code by origin. Error:', e

def __init__(self):
    self._regex_json_data = re.compile(r'(\{.*\})', re.S)
    self._rp_url = 'http://data.eastmoney.com{0}'
    self._title = re.compile(r'<h4>(.*?)<span', re.S)
    self._content = re.compile(r'<pre>(.*?)</pre>', re.S)
    self._collection = Mongodb('192.168.251.95', 27017, 'news', 'research_report_def')
    self._url = 'http://data.eastmoney.com/notice_n/reportHK.aspx?ajax=ajax&type=gs&page={0}&code=&jsname=&rt='

def remove_by_url():
    """Disable (stat=0) the 208 records that distort the `sum` field,
    using the URLs in the files produced by handle_163_text()."""
    count = 0
    verbose_dicts = {}
    files_path = 'D:/temp/data/20151119/'
    coll_208 = Mongodb('192.168.250.208', 27017, 'news', 'topnews_analyse')
    for _docs in coll_208.query({'url': {'$regex': re.compile(r'163\.com')}}, {'url': 1}):
        verbose_dicts[_docs['url']] = _docs['_id']
    keys_set = {key for key in verbose_dicts}
    for _c, filename in enumerate(os.listdir(files_path), 1):
        with open(files_path + filename) as fp:
            url = fp.readline().strip()
            if url in keys_set:
                count += 1
                object_id = verbose_dicts[url]
                coll_208.update({'_id': object_id}, setdata={'stat': 0})
                print('Order: {}, count: {}, id: {}'.format(_c, count, object_id))
                time.sleep(0.4)
                # break
    coll_208.disconnect()

def insert_db_from_file_secu():
    def get_secu(code_string):
        secu = re.compile(r'\((\d+)\)').findall(code_string)
        # print 'se:', secu
        return secu[0]

    secu_keys = ['y', 'secu', 'price', 'amou', 'volu', 'buy', 'sale', 'ot']
    coll_in = Mongodb('192.168.0.223', 27017, 'ada', 'base_block_trade')
    with open('d:/temp/secu_data_json.txt') as fd:
        for k, each in enumerate(fd):
            item = each.strip()
            if not item or item.startswith('#'):
                continue
            to_list = []
            for j, s in enumerate(simplejson.loads(item)):
                if j == 1:
                    tt = s.replace('\\28', '(').replace('\\29', ')')
                    to_list.append(tt)
                else:
                    to_list.append(s.decode('unicode-escape'))
            data = dict(zip(secu_keys, to_list))
            data['s'] = data['secu']
            data['secu'] = get_secu(data['secu'])
            data['volu'] = '{0:.2f}'.format(float(data['volu']) * 10000)
            data['amou'] = '{0:.2f}'.format(float(data['amou']) * 10000)
            data['typ'] = 'sha_secu'
            # print data
            data.pop('ot')
            coll_in.insert(data)
            print '{0} ok'.format(k + 1)
    coll_in.disconnect()

def filter_titles(db_path):
    month = str(date.today()).split('-')[1]
    year_mon = ''.join(str(date.today()).split('-')[:-1])
    days = {
        '01': '31', '02': '28', '03': '31', '04': '30',
        '05': '31', '06': '30', '07': '31', '08': '31',
        '09': '30', '10': '31', '11': '30', '12': '31'
    }
    coll = Mongodb('192.168.0.212', 27017, 'arvin', 'finance_news_all')
    condition = {
        'date': {
            '$gte': long(year_mon + '01000000'),
            '$lte': long(year_mon + days.get(month) + '232359')
        }
    }
    print 'db `title` is loading now, waiting .......'
    filedb = FileBsd('hash', db_path)
    for k, doc in enumerate(coll.query(condition)):
        try:
            filedb.put(md5(doc['title']))
        except Exception as e:
            print 'filter_titles error:', e
    coll.disconnect()
    filedb.close()
    print 'title filter loading finished'

def update_item_sipo():
    coll = Mongodb('192.168.0.223', 27017, 'py_crawl', 'sipo_typ')
    headers_fmgb = [
        'tit', 'sqgbh(申请公布号)', 'sqgbr(申请公布日)', 'sqh(申请号)', 'sqr_day(申请日)',
        'sqr_person(申请人)', 'fmr(发明人)', 'dz(地址)', 'flh(分类号)', 'zy(摘要)',
        'zldljg(专利代理机构)', 'dlr(代理人)', 'yxq(优先权)', 'PCTjrgjjdr(PCT进入国家阶段日)',
        'PCTsqsj(PCT申请数据)', 'PCTgbsj(PCT公布数据)', 'gzwxcbr(更正文献出版日)',
        'swbc(生物保藏)', 'faysq(分案原申请)', 'bgyxq(本国优先权)'
    ]
    headers_syxx = [
        'tit', 'sqggh(授权公告号)', 'sqggr(授权公告日)', 'sqh(申请号)', 'sqr_day(申请日)',
        'zlqr(专利权人)', 'fmr(发明人)', 'dz(地址)', 'flh(分类号)', 'zy(摘要)',
        'zldljg(专利代理机构)', 'dlr(代理人)', 'yxq(优先权)', 'PCTjrgjjdr(PCT进入国家阶段日)',
        'PCTsqsj(PCT申请数据)', 'PCTgbsj(PCT公布数据)', 'gzwxcbr(更正文献出版日)',
        'faysq(分案原申请)', 'bgyxq(本国优先权)'
    ]
    open_book_fmgb = XlsxWriter(path + 'sipo_fmgb.xlsx', 'fmgb', headers_fmgb)
    keys_fmsq = [
        'tit', 'sqgbh', 'sqgbr', 'sqh', 'sqr_day', 'sqr_person', 'fmr', 'dz', 'flh', 'zy',
        'zldljg', 'dlr', 'yxq', 'PCTjrgjjdr', 'PCTsqsj', 'PCTgbsj', 'gzwxcbr', 'swbc',
        'faysq', 'bgyxq'
    ]
    for k, dct in enumerate(coll.query({'type': 'fmgb'}).sort([('_id', 1)])):
        open_book_fmgb.write([dct.get(key, '') for key in keys_fmsq])
        print 'fmgb:', k
    open_book_fmgb.close()

    open_book_syxx = XlsxWriter(path + 'sipo_syxx.xlsx', 'syxx', headers_syxx)
    keys_syxx = [
        'tit', 'sqggh', 'sqggr', 'sqh', 'sqr_day', 'zlqr', 'fmr', 'dz', 'flh', 'zy',
        'zldljg', 'dlr', 'yxq', 'PCTjrgjjdr', 'PCTsqsj', 'PCTgbsj', 'gzwxcbr',
        'faysq', 'bgyxq'
    ]
    for ks, dct in enumerate(coll.query({'type': 'syxx'}).sort([('_id', 1)])):
        open_book_syxx.write([dct.get(key, '') for key in keys_syxx])
        print 'syxx', ks
    open_book_syxx.close()

def main(self, query=None):
    if query is None:
        query_date = [str(datetime.date.today())]
    else:
        query_date = query
    flag = False
    min_date = min(query_date)
    coll = Mongodb('192.168.251.95', 27017, 'news', 'research_report_def')
    url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?'
    query_string = 'type=SR&sty=GGSR&ps=50&p=%s&mkt=0&stat=0&cmd=2&code=&rt='
    for page in range(1, 20):
        py_data = json.loads(self.get_html(url + query_string % str(page), encoding=True)[1:-1])
        for data in py_data:
            code, agency = data['secuFullCode'][:6], data['insName']
            date_time, url_info_code = data['datetime'][:10], data['infoCode']
            report_url = 'http://data.eastmoney.com/report/%s/%s.html' % (date_time.replace('-', ''), url_info_code)
            if date_time in query_date:
                src = self.rr_research_org_code(agency) or ''  # get src
                secu = self.base_stock_code(code) or ''        # get secu
                if coll.get({'url': report_url}, {'titl': 1}) is None:
                    try:
                        now_html = self.get_html(report_url, encoding=True)
                        title = self.remove_tag(self.__title.findall(now_html)[0])
                        content = self.remove_tag(self.__content.findall(now_html)[0])
                        to_data = {
                            'url': report_url,
                            'titl': {'szh': title, 'en': ''},
                            'bio': {'en': '', 'szh': content},
                            'rdt': date_time,
                            'upu': '',
                            'typ': '30001',
                            'stat': 1,
                            'upt': datetime.datetime.now(),
                            'crt': datetime.datetime.now(),
                        }
                        to_data.update({'src': src, 'secu': secu})
                        if not src or not secu:
                            vn_src = '' if src else agency
                            vn_secu = '' if secu else code
                            to_data['vn'] = '^'.join([vn_src, vn_secu])
                        else:
                            to_data['vn'] = None
                        coll.insert(to_data)
                        print '[%s %s FROM %s] -->>> Now insert mongodb!' % (code, date_time, agency)
                    except Exception as e:
                        print 'title: %s, url: %s' % (data['title'], report_url), 'Error:', e
                else:
                    print '[%s %s FROM %s] -->>> already exists in mongodb' % (code, date_time, agency)
            elif date_time < min_date:
                flag = True
                break
        if flag:
            break
    coll.disconnect()

def csf_dict():
    coll200 = Mongodb('192.168.250.208', 27017, 'news', 'csf_dict')
    ww = '送股实施公告 权益分派 分红派息 分红实施 转增股本 分派 OR 利润分配 OR 分配实施 OR 现金股利 OR ' \
         '现金分红 OR 现金红利 OR 股息派发 NOT 调整非公开股票 NOT 调整发行股份 NOT 预案 NOT 预披露 NOT ' \
         '管理制度 NOT 独立意见 NOT 法律意见书 NOT 预告 NOT 说明会 NOT 提示性公告 NOT 英文版 NOT 提议 ' \
         'NOT 临时公告 NOT 募集资金 NOT 完毕 NOT 调整发行股票价格'
    www = ww.replace('OR', ' ').replace('NOT', ' ')
    words = [w.strip() for w in www.split() if w.strip()]
    for wr in words:
        data = {'stat': 1, 'w': 1010, 'nat': 1, 'word': wr}
        coll200.insert(data)
    coll200.disconnect()

def remove_otc():
    excel_path = 'D:/test/need_delete.xlsx'
    workbook = XlsxReader(excel_path)
    coll = Mongodb('122.144.134.95', 27017, 'news', 'announcement_otc')
    for ind, doc in enumerate(workbook.collection(_id=str), 1):
        # print doc['_id'], ind
        if ind >= 0:
            print doc['_id'], ind
            coll.update({'_id': ObjectId(doc['_id'])},
                        setdata={'stat': 0, 'upt': datetime.now()})
            # break
    coll.disconnect()

def update_item():
    conceptions = []
    work_book = XlsxReader(path + 'www.xlsx')
    base_keys = ['conp', 'resc', 'cpcd', 'idxcd', 'rel']
    for _k, doc in enumerate(work_book.collection(), 1):
        temp = {}
        temp['conp'] = doc['conp']
        temp['cpcd'] = doc['cpcd']
        temp['resc'] = [s.strip() for s in doc['resc'].split('&')]
        temp['rel'] = [s.strip() for s in doc['rel'].split(';')]
        temp['idxcd'] = [] if not doc['idxcd'].strip() else [doc['idxcd'].strip()]
        cw = []
        for k, vs in doc.iteritems():
            if k not in base_keys:
                cw.extend([v.strip() for v in vs if v.strip()])
        temp['cw'] = cw
        print _k
        conceptions.append(temp)
    print 'read xlsx finished.'

    # merge the idxcd lists of rows that share the same conp
    dicts = defaultdict(list)
    for dox in conceptions:
        dicts[dox['conp']].extend(dox['idxcd'])
    for _temp_data in conceptions:
        conp = _temp_data['conp']
        _temp_data['idxcd'] = dicts[conp]

    coll = Mongodb('192.168.250.208', 27017, 'news', 'news_conp')
    for data in conceptions:
        coll.insert(data)
    coll.disconnect()

def update():
    coll_in = Mongodb('192.168.251.95', 27017, 'news', 'announcement_hk_chz')
    coll_cat = Mongodb('192.168.251.95', 27017, 'ada', 'dict_announce_catalog_hk')
    coll_secu = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
    count = 0
    for code, query in codes_date:
        ktt = 0
        count += 1
        validate(code, query)
        print '[%s-->>%s,%s]' % (count, code, query), ':waiting few minutes......\n'
        dctu = PoskUpdate(code, query).main()
        # codes, date, cat, title, url
        for codes, dt, cat, title, url, cat_origin in dctu:
            ktt += 1
            print '\t[%s ->> ktt:%s]' % (code, ktt), '|', codes, '|', dt, '|', title, '|', url
            for code_ in codes:
                secu = get_secu(code_, coll_secu)
                print 'secu:', secu
                if secu and not coll_in.get({'sid': url}, {'title': 1}):
                    try:
                        hk_data = post_dict(secu, dt, cat, title, url, cat_origin, coll_cat)
                        coll_in.insert(hk_data)
                    except Exception as e:
                        print '\t[%s] |%s|upload error: %s!' % (code_, dt, e.message)
                # inds_mon = coll_in.get({'sid': url}, {'title': 1})
                # ind_url = "http://192.168.250.205:17081/indexer/services/indexes/delta.json?" \
                #           "indexer=announce_hkz&taskids="
                # if inds_mon:  # create the search index
                #     jdata = BaseDownloadHtml().get_html(ind_url + str(inds_mon['_id']))[0]
                #     if json.loads(jdata)['code'] == 200:
                #         print '\tcreate index is ok!\n\n'
            if ktt % 80 == 0:
                sleep(2 * 60)
    coll_in.disconnect()
    coll_cat.disconnect()
    coll_secu.disconnect()

def main(self):
    if not self._validity:
        print 'SZX this is Saturday or Monday!'
        return 0
    coll_in = Mongodb('192.168.251.95', 27017, 'ada', 'base_margin_trading')
    coll_stock = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
    coll_fund = Mongodb('192.168.251.95', 27017, 'fund', 'base_fund')
    url = 'http://www.szse.cn/szseWeb/FrontController.szse?randnum=&'
    t = lambda v: '%.4f' % float(v)
    for page in range(1, 30):
        break_point = False
        html = self.get_html(url + self._query_string.format(self._query_date, page), encoding=True)
        for it in self.extract(html):
            # print it[0], it[1], it[2], it[3], it[4], it[5], it[6]
            break_point = True
            secu_cd = secu_code(it[0], coll_stock, coll_fund)
            fiba_bre = szx_fiba_bre(secu_cd, coll_in, self._query_date)
            sema_bre = szx_sema_bre(secu_cd, coll_in, self._query_date)
            # Today's financing repayment = previous financing balance + today's financing buy
            #   - today's financing balance (CNY)  (fi.re = fi.ba(prev) + fi.bu - fi.ba)
            # Securities-lending repayment volume = sell volume + previous outstanding volume
            #   - outstanding volume  (se.re = se.so + se.ma(prev) - se.ma)
            szx_fs_data = {
                'secu': secu_cd or it[0],
                'date': self._query_date,
                'total': t(it[6]),
                'stat': 2,
                'typ': 'szx',
                'crt': datetime.now(),
                'fi': {
                    'ba': t(it[2]),
                    'bu': t(it[1]),
                    're': t(float(it[1]) + fiba_bre - float(it[2]))
                },
                'se': {
                    'ba': t(it[5]),
                    'ma': t(it[4]),
                    'so': t(it[3]),
                    're': t(float(it[3]) + sema_bre - float(it[4]))
                },
                'upt': datetime.now()
            }
            print szx_fs_data
            if not coll_in.get({'secu': secu_cd or it[0], 'date': self._query_date, 'typ': 'szx'}):
                print coll_in.insert(szx_fs_data)
        if not break_point:
            break
        print u'szx [%s] 融资融券交易明细 day update: %d page done!' % (self._query_date, page)
        # break
    coll_in.disconnect()
    coll_stock.disconnect()
    coll_fund.disconnect()

def insert_db(self, total_data):
    coll_in = Mongodb('192.168.251.95', 27017, 'ada', 'base_margin_trading')
    coll_stock = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
    coll_fund = Mongodb('192.168.251.95', 27017, 'fund', 'base_fund')
    sql_db = MySQLClient("192.168.251.95", "python_team", "python_team", "ada-fd")
    print '\tnow start to insert mongodb, waiting......'
    d = (lambda v: '%.4f' % float(v))
    for pdt in total_data:
        # Row layout: credit trade date, security code, security name,
        # financing balance (CNY), financing buy amount (CNY), financing repayment (CNY),
        # securities-lending outstanding volume, sell volume, repayment volume
        secu_cd = secu_code(pdt[1], coll_stock, coll_fund)
        trade_date = '-'.join([pdt[0][:4], pdt[0][4:6], pdt[0][6:]])
        uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, ''.join(self._valid(pdt)).encode('u8')))
        data = {
            'secu': secu_cd or pdt[1],
            'date': trade_date,
            'total': d(int(pdt[3])),
            'stat': 2,
            'typ': 'sha',
            'crt': datetime.now(),
            'uuid': uid,
            'fi': {'ba': d(pdt[3]), 'bu': d(pdt[4]), 're': d(pdt[5])},
            'se': {'ba': '0.0000', 'ma': d(pdt[6]), 'so': d(pdt[7]), 're': d(pdt[8])},
            'upt': datetime.now()
        }
        if coll_in.get({'uuid': uid, 'typ': 'sha'}, {'secu': 1}):
            continue
        elif secu_cd is None:
            coll_in.insert(data)
        else:
            seba = sha_seba(secu_cd, pdt[6], trade_date, sql_db)
            if seba is not None:
                data['total'] = d(int(pdt[3]) + seba)
                data['se']['ba'] = d(seba)
            coll_in.insert(data)
    coll_in.disconnect()
    coll_stock.disconnect()
    sql_db.disconnect()
    print '\tinsert all done!'

def csf_news():
    coll200 = Mongodb('192.168.250.208', 27017, 'news', 'new_keyword_dict')
    coll_csf = Mongodb('192.168.250.208', 27017, 'news', 'csf_dict')
    for k, doc in enumerate(coll200.query(), 1):
        word = doc['word']
        coll_csf.insert({'word': word, 'nat': 0, 'stat': 2, 'w': 1000})
        print k
    coll200.disconnect()
    coll_csf.disconnect()

host = '192.168.100.20'
port = 27017
coll = Mongodb(host, port, 'opt', 'test')


def temp():
    mapping = {
        '美股新闻': {'美股新闻', 'us_gg', 'us_hy', 'us_hg', 'us_gs', '美股个股', 'us'},
        '基金新闻': {'基金新闻'},
        '新三板': {'新三板'},
        '港股新闻': {'港股新闻', 'hk'},
        'A股新闻': {
            '公司新闻', '股评新闻', '宏观新闻', ' 宏观新闻', '行业新闻', ' 行业新闻',
            '热点新闻', 'hjd', 'hot', 'test', '政策新闻', 'hotnews', '私募投资'
        },
    }
    client = MongoClient('192.168.100.20')
    coll_20 = client.news_crawl.finance_news_all_org

base_url = 'http://www.szse.cn/szseWeb/FrontController.szse?randnum=&'
query_string = 'ACTIONID=7&AJAX=AJAX-TRUE&CATALOGID=1265_xyjy&TABKEY=tab1&tab1PAGECOUNT={0}&' \
               'tab1RECORDCOUNT={1}&REPORT_ACTION=navigate&tab1PAGENUM={2}'
bond_string = 'ACTIONID=7&AJAX=AJAX-TRUE&CATALOGID=dzjy_xyjy&TABKEY=tab1&tab1PAGECOUNT={0}&' \
              'tab1RECORDCOUNT={1}&REPORT_ACTION=navigate&tab1PAGENUM={2}'
user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
referer = 'http://www.sse.com.cn/disclosure/diclosure/block/deal/'
stock_fund_url = 'http://query.sse.com.cn/commonQuery.do?&jsonCallBack=&isPagination=true&' \
                 'sqlId=COMMON_SSE_XXPL_JYXXPL_DZJYXX_L_1&stockId=&startDate=%s&endDate=%s&' \
                 'pageHelp.pageSize=15&pageHelp.pageNo=1&pageHelp.beginPage=%s&pageHelp.endPage=5&' \
                 'pageHelp.cacheSize=1'
bond_url = 'http://query.sse.com.cn/commonQuery.do?&jsonCallBack=&isPagination=true&' \
           'sqlId=COMMON_SSE_XXPL_JYXXPL_DZJYXX_L_2&stockId=&startDate=%s&endDate=%s&' \
           'pageHelp.pageSize=15&pageHelp.pageNo=1&pageHelp.beginPage=%s&pageHelp.endPage=5&' \
           'pageHelp.cacheSize=1'
coll_in = Mongodb('192.168.251.95', 27017, 'ada', 'base_block_trade')
coll_stock = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
coll_fund = Mongodb('192.168.251.95', 27017, 'fund', 'base_fund')
coll_bond = Mongodb('192.168.251.95', 27017, 'ada', 'base_bond')
coll_vary = Mongodb('192.168.251.95', 27017, 'ada', 'base_share_vary')
mysql = MySQLClient("192.168.251.95", "python_team", "python_team", "ada-fd")
sha_command_history = r'casperjs D:\project\autumn\crawler\block_trade\block_trade_with_date.js ' \
                      r'--st_date={0} --ed_date={1} --outfile={0}'
sha_command_update = r'casperjs block_trade.js'

def third_update():
    coll_in = Mongodb('192.168.251.95', 27017, 'news', 'announcement_hk_chz')
    coll_cat = Mongodb('192.168.251.95', 27017, 'ada', 'dict_announce_catalog_hk')
    coll_secu = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
    kt = 0
    cdctuo = ThirdUpdate().main()
    cd_dt_cat_tit_url_ori = cdctuo if cdctuo else []
    for codes, dt, cat, title, url, cat_origin in cd_dt_cat_tit_url_ori:
        kt += 1
        for code in codes:
            secu = get_secu(code, coll_secu)
            if secu and not coll_in.get({'sid': url, 'secu.0.cd': secu[0]['cd']}, {'title': 1}):
                print 'kt:', kt, '|', code, '|', dt, '|', url, '\n|', title
                try:
                    hk_data = post_dict(secu, dt, cat, title, url, cat_origin, coll_cat)
                    coll_in.insert(hk_data)
                except Exception as e:
                    print 'Error:', e.message
            # create the search index
            # inds_mon = coll_in.get({'sid': url}, {'title': 1})
            # ind_url = "http://192.168.250.205:17081/indexer/services/indexes/delta.json?" \
            #           "indexer=announce_hkz&taskids="
            # if inds_mon:
            #     jdata = BaseDownloadHtml().get_html(ind_url + str(inds_mon['_id']))[0]
            #     if json.loads(jdata)['code'] == 200:
            #         print '\tcreate index is ok!\n\n'
    coll_in.disconnect()
    coll_cat.disconnect()
    coll_secu.disconnect()

def statistics(months=None, weeks=None, days=None):
    if months:
        query_range = str(datetime.now() - timedelta(days=30)).replace('-', '')[:8]
    elif weeks:
        query_range = str(datetime.now() - timedelta(days=7)).replace('-', '')[:8]
    elif days:
        # not implemented: query_range is left undefined on this branch
        pass
    coll_from = Mongodb('192.168.250.208', 27017, 'news', 'hotnews_analyse')
    coll_to = Mongodb('192.168.250.208', 27017, 'news', 'statistics')
    all_ind = {
        _ind
        for _doc in coll_from.query(kwargs={'ind': 1})
        for _ind in _doc.get('ind', [])
    }
    for ind in all_ind:
        counter = Counter()
        query_cond = {'ind': {'$in': [ind]}, 'dt': {'$gte': query_range + '000000'}}
        for doc in coll_from.query(query_cond, {'kw': 1}):
            counter.update(doc.get('kw', []))
        data = {'ind': ind, 'count': counter.most_common(100), 'dt': query_range}
        coll_to.insert(data)
    coll_from.disconnect()
    coll_to.disconnect()

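# Usage sketch: pick exactly one look-back flag, since the `days` branch above leaves
# `query_range` undefined (an assumption about intended use, not documented behaviour):
if __name__ == '__main__':
    statistics(months=True)    # keyword counts per industry over the last 30 days
    # statistics(weeks=True)   # or over the last 7 days
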
class SipoSeparate(object):
    def __init__(self, typ):
        self._typ = typ
        self._sipo_datas = []
        self._mongodb = Mongodb('192.168.0.223', 27017, 'py_crawl', 'sipo_typ')
        self._url_with_query_string = query_string.get_url_with_query_string(self._typ)

    def extract(self, page):
        checked = lambda t: re.compile(r'\(\d{4}\.\d\d\)', re.S).search(t)
        url = self._url_with_query_string.format(page=page)
        html_pyq = download.RequestHtml().get_html(url)
        document = PyQuery(html_pyq)
        for each_node in document.items('.cp_linr'):
            each_node.remove('a')
            data = {'tit': each_node('h1').text(), 'type': self._typ}
            # parse the abstract (zhaiyao) block into key/value pairs
            data.update(self.initial_value(each_node('.cp_jsh').text()))
            for k, node_li in enumerate(each_node('.cp_linr > ul > li').items()):
                if node_li('li').length == 1:
                    data.update(self.initial_value(node_li.text()))
                else:
                    # handle <li> nodes with multiple children; a child's content
                    # sometimes belongs to its parent node
                    flh_flag = False  # whether the classification number (flh) spans several child <li> nodes
                    parent_node_li_text = ''
                    for child_li in node_li.items('li'):
                        if child_li('li').length > 1:
                            flh_flag = True
                            parent_node_li_text += child_li.remove('li').text()
                        else:
                            if flh_flag:
                                flh_flag = False
                                child_li_text = child_li.text()
                                if checked(child_li_text):
                                    # the child text continues the parent's flh value
                                    parent_node_li_text += child_li_text
                                else:
                                    # standalone field: parse it on its own
                                    data.update(self.initial_value(child_li_text))
                                data.update(self.initial_value(parent_node_li_text))
                            else:
                                data.update(self.initial_value(child_li.text()))
            self._sipo_datas.append(data)

    @staticmethod
    def initial_value(string):
        if not string.strip():
            return {}
        key_value = lambda t: re.compile(r'(.*?):(.*)', re.S).findall(t)
        try:
            key, value = key_value(string)[0]
            init_py = pinyin.get_initial(key, delimiter='')
            if init_py == 'sqr':
                if pinyin.get(key, delimiter='').endswith('ren'):
                    init_py = '_'.join((init_py, 'person'))
                else:
                    init_py = '_'.join((init_py, 'day'))
            return dict(((init_py, value), ))
        except IndexError:
            pass
        return {}

    def insert_mongo(self, iterable):
        pool = ThreadPool(8)
        pool.map(self.extract, iterable)
        pool.close()
        pool.join()
        # now insert into mongodb at 192.168.0.223
        for mon_data in self._sipo_datas:
            self._mongodb.insert(mon_data)
        del self._sipo_datas[:]

    def main(self):
        unit = 100
        pages_list = range(1, getattr(query_string, '_'.join(['num', self._typ])) + 1)
        pagination = len(pages_list) / unit + ((len(pages_list) % unit) and 1)
        dummy_pages_list = [pages_list[p * unit:(p + 1) * unit] for p in range(pagination)]
        for k, dummy_page in enumerate(dummy_pages_list):
            # print '\t%s: %s' % (self._typ, dummy_page)
            print 'Now executing [ {1}:{0}] times, from {2} to {3}.'.format(
                k + 1, self._typ, k * unit + 1, (k + 1) * unit)
            self.insert_mongo(dummy_page)
            print '->>>[ {1}: {0} ] times execute is ok, will sleep 30 seconds.\n'.format(k + 1, self._typ)
            time.sleep(30)
            break

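# A minimal driver sketch; 'fmgb' is only an example type value and assumes the
# `query_string` module exposes a matching `num_fmgb` page count:
if __name__ == '__main__':
    SipoSeparate('fmgb').main()  # crawls pages in batches of 100 into py_crawl.sipo_typ
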
def main(self): if not self._validity: print "SZX this is Saturday or Monday!" return 0 coll_in = Mongodb("192.168.251.95", 27017, "ada", "base_margin_trading") coll_stock = Mongodb("192.168.251.95", 27017, "ada", "base_stock") coll_fund = Mongodb("192.168.251.95", 27017, "fund", "base_fund") url = "http://www.szse.cn/szseWeb/FrontController.szse?randnum=&" t = lambda v: "%.4f" % float(v) for page in range(1, 30): break_point = False html = self.get_html(url + self._query_string.format(self._query_date, page), encoding=True) for it in self.extract(html): # print it[0], it[1], it[2], it[3], it[4], it[5], it[6] break_point = True secu_cd = secu_code(it[0], coll_stock, coll_fund) fiba_bre = szx_fiba_bre(secu_cd, coll_in, self._query_date) sema_bre = szx_sema_bre(secu_cd, coll_in, self._query_date) # 本日融资偿还额 = 前日融资余额 + 本日融资买入- 本日融资余额(元) (fi.re = fi.ba(上期) + fi.bu - fi.ba) # 融券偿还量 = 融券卖出量 + 融券余量(上期) - 融券余量 (se.re = se.so + se.ma(上期) - se.ma) szx_fs_data = { "secu": secu_cd or it[0], "date": self._query_date, "total": t(it[6]), "stat": 2, "typ": "szx", "crt": datetime.now(), "fi": {"ba": t(it[2]), "bu": t(it[1]), "re": t(float(it[1]) + fiba_bre - float(it[2]))}, "se": { "ba": t(it[5]), "ma": t(it[4]), "so": t(it[3]), "re": t(float(it[3]) + sema_bre - float(it[4])), }, "upt": datetime.now(), } print szx_fs_data if not coll_in.get({"secu": secu_cd or it[0], "date": self._query_date, "typ": "szx"}): print coll_in.insert(szx_fs_data) if not break_point: break print u"szx [%s] 融资融券交易明细 day update: %d page done!" % (self._query_date, page) # break coll_in.disconnect() coll_stock.disconnect() coll_fund.disconnect()
URL = 'http://www.szse.cn/main/disclosure/jgxxgk/djggfbd/'
base_url = 'http://www.szse.cn/szseWeb/FrontController.szse?randnum=&'
query_string_by_date = 'ACTIONID=7&AJAX=AJAX-TRUE&CATALOGID=1801_cxda&TABKEY=tab1&selectGsbk=&txtDMorJC=&' \
                       'txtGgxm=&txtStart={0}&txtEnd={1}&REPORT_ACTION=search'
query_string_update = 'ACTIONID=7&AJAX=AJAX-TRUE&CATALOGID=1801_cxda&TABKEY=tab1&tab1PAGECOUNT={0}&' \
                      'tab1RECORDCOUNT={1}&REPORT_ACTION=navigate&tab1PAGENUM={2}'
query_string_history = 'ACTIONID=7&AJAX=AJAX-TRUE&CATALOGID=1801_cxda&txtStart={0}&txtEnd={1}&TABKEY=tab1&' \
                       'tab1PAGECOUNT={2}&tab1RECORDCOUNT={3}&REPORT_ACTION=navigate&tab1PAGENUM={4}'
referer = 'http://www.sse.com.cn/disclosure/credibility/change/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
sha_query_string = 'http://query.sse.com.cn/commonQuery.do?&jsonCallBack=&isPagination=true&' \
                   'sqlId=COMMON_SSE_XXPL_CXJL_SSGSGFBDQK_S&COMPANY_CODE=&NAME=&BEGIN_DATE=%s&END_DATE=%s&' \
                   'pageHelp.pageSize=15&pageHelp.pageNo=1&pageHelp.beginPage=%s&pageHelp.cacheSize=1&' \
                   'pageHelp.endPage=5'
sha_updating_of_day = 'http://query.sse.com.cn/commonQuery.do?&jsonCallBack=&isPagination=true&' \
                      'sqlId=COMMON_SSE_XXPL_CXJL_SSGSGFBDQK_S&pageHelp.pageSize=15&pageHelp.pageNo=%s&' \
                      'pageHelp.beginPage=%s&pageHelp.cacheSize=1&pageHelp.endPage=%s1'
coll_in = Mongodb('192.168.251.95', 27017, 'ada', 'base_executive_regulation')
coll_stock = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock')
coll_exec = Mongodb('192.168.251.95', 27017, 'ada', 'base_executive')
coll_vary = Mongodb('192.168.251.95', 27017, 'ada', 'base_share_vary')
coll_curr = Mongodb('192.168.251.95', 27017, 'ada', 'dict_currency')

def __init__(self, typ):
    self._typ = typ
    self._sipo_datas = []
    self._sipo_form = form_data.get_form_data(self._typ)
    self._mongodb = Mongodb('192.168.0.223', 27017, 'py_crawl', 'sipo')