def remove_by_url():
    """Flag (stat=0) records in 208's `topnews_analyse` whose 163.com url
    appears in the files produced by handle_163_text — i.e. the text records
    that distort the `sum` field.
    """
    count = 0
    files_path = 'D:/temp/data/20151119/'
    coll_208 = Mongodb('192.168.250.208', 27017, 'news', 'topnews_analyse')
    # Map every 163.com url in the collection to its document _id.
    url_to_id = {}
    for _docs in coll_208.query({'url': {'$regex': re.compile(r'163\.com')}},
                                {'url': 1}):
        url_to_id[_docs['url']] = _docs['_id']
    for _c, filename in enumerate(os.listdir(files_path), 1):
        # Each dump file stores the source url on its first line.
        with open(files_path + filename) as fp:
            url = fp.readline().strip()
        # Dict membership is O(1) — no need for the separate key set the
        # original built from the same dict.
        if url in url_to_id:
            count += 1
            object_id = url_to_id[url]
            coll_208.update({'_id': object_id}, setdata={'stat': 0})
            print('Order: {}, count: {}, id: {}'.format(
                _c, count, object_id))
            time.sleep(0.4)  # throttle updates against the shared server
            # break
    coll_208.disconnect()
def get_base_stock_code(stock_code): coll = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock') try: for d in coll.query({'tick': stock_code}).sort([('crt', 1)]): return d.get('code') except Exception as e: print 'no get code by base_stock. Error:', e
def base_stock_code(self, stock_code): coll = Mongodb('192.168.251.95', 27017, 'ada', 'base_stock') try: for d in coll.query({'tick': stock_code}).sort([('crt', 1)]): return d.get('code') except Exception as e: print 'no get code by base_stock. Error:', e
class FundMapping(object): def __init__(self): self._collection = Mongodb('192.168.250.200', 27017, 'fund', 'base_fund') self._url = 'http://fund.csrc.gov.cn/web/classification_show.organization' def get_fund_mapping(self): # sub_code, sub_name, main_code, main_name sub_to_main_mapping = [] html = requests.get(self._url, timeout=30.0).content document = PyQuery(unicode(html, 'utf-8')) fund_blocks = [document.items('.aa'), document.items('.dd')] for each_block in fund_blocks: for class_tag in each_block: items_list = [item.text() for item in class_tag.items('td')] sub_to_main_mapping.append((items_list[1], items_list[3])) return dict(sub_to_main_mapping) def update_to_mongo(self): fund_mapping = self.get_fund_mapping() for item in self._collection.query(kwargs={'code': 1}).sort([('_id', 1)]): key = item['code'][:6] main_fund_code = fund_mapping.get(key) if main_fund_code is not None: regex = re.compile(r'{0}'.format(main_fund_code)) main_fund_sid = self._collection.get({'code': regex}, {'sid': 1}) print 'main:', main_fund_sid _main = (main_fund_sid or {}).get('sid', '') self._collection.update({'_id': item['_id']}, setdata={'main': _main})
def handle_163_text(): """ 找出208 sum 字段有影响的文本记录 """ pattern = re.compile(r'网易财经会赚钱的客户端|网易财经 会赚钱的客户端') query_cond = {'url': {'$regex': re.compile(r'163\.com')}, 'ratio': 0} coll = Mongodb('192.168.0.223', 27017, 'news_crawl', 'hot_news') # print coll.query({'content': {'$regex': re.compile(r'%s' % text)}}).count() for k, doc in enumerate(coll.query(query_cond), 1): content = doc['content'] if pattern.search(content) is not None: url = doc['url'] title = doc['title'].split('重点推荐', 1)[0] auth = doc['author'] pub_date = doc['date'] cat = doc.get('source') ratio = doc.get('ratio') new_content = pattern.split(content, 1)[0].split('div,h1', 1)[0].strip('#&# ') if cat and new_content: lines = [ url, title, auth, str(pub_date), new_content, cat, str(ratio) ] write(path + '20151119/', str(pub_date), lines) print 'id:', doc['_id'], k coll.disconnect()
def get_rr_research_org_code(origin): coll = Mongodb('192.168.251.95', 27017, 'ada', 'rr_research_org') try: for doc in coll.query({'abbr.szh': {'$regex': origin}}): if doc['abbr']['szh'] == origin or origin in doc['rs']: return doc['code'] except Exception as e: print 'no get code by origin. Error:', e
def rr_research_org_code(self, origin): coll = Mongodb('192.168.251.95', 27017, 'ada', 'rr_research_org') try: for doc in coll.query({'abbr.szh': {'$regex': origin}}): if doc['abbr']['szh'] == origin or origin in doc['rs']: return doc['code'] except Exception as e: print 'no get code by origin. Error:', e
def csf_news(): coll200 = Mongodb('192.168.250.208', 27017, 'news', 'new_keyword_dict') coll_csf = Mongodb('192.168.250.208', 27017, 'news', 'csf_dict') for k, doc in enumerate(coll200.query(), 1): word = doc['word'] coll_csf.insert({'word': word, 'nat': 0, 'stat': 2, 'w': 1000}) print k coll200.disconnect() coll_csf.disconnect()
def statistics(months=None, weeks=None, days=None):
    """Aggregate the 100 most common keywords per industry over a window and
    store the result in news.statistics.

    Exactly one of *months* / *weeks* / *days* should be truthy, selecting a
    30 / 7 / 1 day window back from now.

    :raises ValueError: when no window flag is given. (The original left
        `query_range` unbound in that case — and in the `days` case, whose
        branch was `pass` — crashing later with a NameError.)
    """
    if months:
        span = timedelta(days=30)
    elif weeks:
        span = timedelta(days=7)
    elif days:
        # NOTE(review): original branch was `pass`; a one-day window is
        # assumed here — confirm the intended meaning of `days`.
        span = timedelta(days=1)
    else:
        raise ValueError('one of months/weeks/days must be set')
    # 'YYYYMMDD' of the window start.
    query_range = str(datetime.now() - span).replace('-', '')[:8]
    coll_from = Mongodb('192.168.250.208', 27017, 'news', 'hotnews_analyse')
    coll_to = Mongodb('192.168.250.208', 27017, 'news', 'statistics')
    # All distinct industry codes present in the source collection.
    all_ind = {
        _ind
        for _doc in coll_from.query(kwargs={'ind': 1})
        for _ind in _doc.get('ind', [])
    }
    for ind in all_ind:
        counter = Counter()
        query_cond = {
            'ind': {'$in': [ind]},
            'dt': {'$gte': query_range + '000000'}
        }
        for doc in coll_from.query(query_cond, {'kw': 1}):
            counter.update(doc.get('kw', []))
        coll_to.insert({
            'ind': ind,
            'count': counter.most_common(100),
            'dt': query_range
        })
    coll_from.disconnect()
    coll_to.disconnect()
def update_item_sipo(): coll = Mongodb('192.168.0.223', 27017, 'py_crawl', 'sipo_typ') headers_fmgb = [ 'tit', 'sqgbh(申请公布号)', 'sqgbr(申请公布日)', 'sqh(申请号)', 'sqr_day(申请日)', 'sqr_person(申请人)', 'fmr(发明人)', 'dz(地址)', 'flh(分类号)', 'zy(摘要)', 'zldljg(专利代理机构)', 'dlr(代理人)', 'yxq(优先权)', 'PCTjrgjjdr(PCT进入国家阶段日)', 'PCTsqsj(PCT申请数据)', 'PCTgbsj(PCT公布数据)', 'gzwxcbr(更正文献出版日)', 'swbc(生物保藏)', 'faysq(分案原申请)', 'bgyxq(本国优先权)' ] headers_syxx = [ 'tit', 'sqggh(授权公告号)', 'sqggr(授权公告日)', 'sqh(申请号)', 'sqr_day(申请日)', 'zlqr(专利权人)', 'fmr(发明人)', 'dz(地址)', 'flh(分类号)', 'zy(摘要)', 'zldljg(专利代理机构)', 'dlr(代理人)', 'yxq(优先权)', 'PCTjrgjjdr(PCT进入国家阶段日)', 'PCTsqsj(PCT申请数据)', 'PCTgbsj(PCT公布数据)', 'gzwxcbr(更正文献出版日)', 'faysq(分案原申请)', 'bgyxq(本国优先权)' ] open_book_fmgb = XlsxWriter(path + 'sipo_fmgb.xlsx', 'fmgb', headers_fmgb) keys_fmsq = [ 'tit', 'sqgbh', 'sqgbr', 'sqh', 'sqr_day', 'sqr_person', 'fmr', 'dz', 'flh', 'zy', 'zldljg', 'dlr', 'yxq', 'PCTjrgjjdr', 'PCTsqsj', 'PCTgbsj', 'gzwxcbr', 'swbc', 'faysq', 'bgyxq' ] for k, dct in enumerate(coll.query({'type': 'fmgb'}).sort([('_id', 1)])): open_book_fmgb.write([dct.get(key, '') for key in keys_fmsq]) print 'fmgb:', k open_book_fmgb.close() open_book_syxx = XlsxWriter(path + 'sipo_syxx.xlsx', 'syxx', headers_fmgb) keys_syxx = [ 'tit', 'sqggh', 'sqggr', 'sqh', 'sqr_day', 'zlqr', 'fmr', 'dz', 'flh', 'zy', 'zldljg', 'dlr', 'yxq', 'PCTjrgjjdr', 'PCTsqsj', 'PCTgbsj', 'gzwxcbr', 'faysq', 'bgyxq' ] for ks, dct in enumerate(coll.query({'type': 'syxx'}).sort([('_id', 1)])): open_book_syxx.write([dct.get(key, '') for key in keys_syxx]) print 'syxx', ks open_book_syxx.close()
def filter_titles(db_path): month = str(date.today()).split('-')[1] year_mon = ''.join(str(date.today()).split('-')[:-1]) days = { '01': '31', '02': '28', '03': '31', '04': '30', '05': '31', '06': '30', '07': '31', '08': '31', '09': '30', '10': '31', '11': '30', '12': '31' } coll = Mongodb('192.168.0.212', 27017, 'arvin', 'finance_news_all') condition = { 'date': { '$gte': long(year_mon + '01000000'), '$lte': long(year_mon + days.get(month) + '232359') } } ######################################################## print 'db `title` is loading now, waiting .......' ######################################################## filedb = FileBsd('hash', db_path) for k, doc in enumerate(coll.query(condition)): try: filedb.put(md5(doc['title'])) except Exception as e: print 'filter_titles error:', e coll.disconnect() filedb.close() ##################################################### print 'title filter loading finished'
def filter_titles(db_path): month = str(date.today()).split('-')[1] year_mon = ''.join(str(date.today()).split('-')[:-1]) days = {'01': '31', '02': '28', '03': '31', '04': '30', '05': '31', '06': '30', '07': '31', '08': '31', '09': '30', '10': '31', '11': '30', '12': '31'} coll = Mongodb('192.168.0.212', 27017, 'arvin', 'finance_news_all') condition = {'date': {'$gte': long(year_mon + '01000000'), '$lte': long(year_mon + days.get(month) + '232359')}} ######################################################## print 'db `title` is loading now, waiting .......' ######################################################## filedb = FileBsd('hash', db_path) for k, doc in enumerate(coll.query(condition)): try: filedb.put(md5(doc['title'])) except Exception as e: print 'filter_titles error:', e coll.disconnect() filedb.close() ##################################################### print 'title filter loading finished'