def find_repeat_news():
    """Flag news from the last hour as duplicates.

    Sets ``repeat`` on each document: 1 = duplicate of another recent
    item, -1 = unique. Documents that already carry a ``repeat`` value
    are left as-is.
    """
    collection = Mongo().news
    recent = list(collection.find({'created_at': {'$gt': time.time() - 3600}}))

    # Pre-compute keywords once per document for the pairwise similarity test.
    for doc in recent:
        doc['keywords_temp'] = get_keywords(doc['content'])

    for candidate in recent:
        # Already classified on a previous run (both 1 and -1 are truthy).
        if candidate.get('repeat'):
            candidate['state'] = 1
            continue
        for other in recent:
            if other['_id'] == candidate['_id']:
                continue
            if is_sim(candidate, other):
                candidate['repeat'] = 1
                candidate['state'] = 1
                break
        if candidate.get('state') != 1:
            candidate['state'] = 1
            candidate['repeat'] = -1

    # Drop the scratch fields before persisting.
    for doc in recent:
        doc.pop('state')
        doc.pop('keywords_temp')
        collection.save(doc)
def get_github_data(token_name):
    """Return the most recent GitHub stats document for *token_name*.

    The ``_id`` field is stripped before wrapping the document in a
    success payload; any failure (including no matching document, which
    raises IndexError on the cursor) yields the generic error payload.
    """
    collection = Mongo().github
    try:
        latest = (collection.find({'token_name': token_name})
                  .sort('spider_time', pymongo.DESCENDING)
                  .limit(1)[0])
        latest.pop('_id')
        return return_success(data=latest)
    except Exception:
        return return_error()
def statistic_token_address(token_name):
    """Return the latest holder-address count and its ~24h change.

    :param token_name: token identifier stored in the ``token_address``
        collection.
    :return: ``(found, current_count, delta)``; ``(False, False, False)``
        when the token has no records at all.
    """
    collection = Mongo().token_address
    cursor = collection.find({
        'token_name': token_name
    }).sort('time', pymongo.DESCENDING).limit(1)
    if cursor.count() == 0:
        return False, False, False
    current_info = cursor[0]

    day_ago = current_info['time'] - 86400
    # Oldest record inside the last 24h. This is always non-empty because
    # current_info itself matches the $gt filter.
    last_info = collection.find({
        'time': {'$gt': day_ago},
        'token_name': token_name
    }).sort('time', pymongo.ASCENDING).limit(1)[0]

    if current_info['address'] - last_info['address'] == 0:
        # No movement inside the window — compare against the newest record
        # older than 24h instead.
        older = collection.find({
            'time': {'$lte': day_ago},
            'token_name': token_name
        }).sort('time', pymongo.DESCENDING).limit(1)
        # BUG FIX: the original indexed [0] unconditionally here and raised
        # IndexError when no record older than 24h existed. Keep the in-window
        # baseline (delta 0) in that case.
        if older.count() > 0:
            last_info = older[0]

    return True, current_info['address'], current_info['address'] - last_info['address']
def start_spider(self):
    """Pull the bishijie newsflash feed and insert items not yet stored.

    Retries the task (via ``self.retry``) when the API reports an error
    or any exception occurs during the run.
    """
    payload = None
    try:
        payload = requests.get(conf['news']['bishijie']).json()
        collection = Mongo().news
        if payload['error'] != 0:
            self.retry()
        data = payload['data']
        for date_key in data:
            flashes = data[date_key]['buttom']
            wanted_ids = [flash['newsflash_id'] for flash in flashes]
            stored = collection.find({
                'spider_from': 'bishijie',
                'source_id': {'$in': wanted_ids}
            })
            known_ids = [doc['source_id'] for doc in stored]
            for flash in flashes:
                if flash['newsflash_id'] in known_ids:
                    continue
                content = flash['content']
                # Titles arrive embedded as 【title】 at the head of the body;
                # absence of the brackets leaves the title empty.
                try:
                    head = content.index('【')
                    tail = content.index('】')
                    title = content[head + 1:tail]
                    content = content[tail + 1:]
                except Exception:
                    title = ''
                collection.insert({
                    'type': 'news',
                    'created_at': flash['issue_time'],
                    'author': flash['source'],
                    'spider_from': 'bishijie',
                    'source': 'bishijie',
                    'source_id': flash['newsflash_id'],
                    'title': title,
                    'content': content,
                    'url': 'http://www.bishijie.com/home/newsflashpc/detail?id=' + str(flash['newsflash_id']),
                    'images': [],
                    'keywords': [],
                    'has_send': 0
                })
    except Exception as e:
        self.retry(e)
def deal_content():
    """Attach extracted keywords to every news document that lacks them."""
    collection = Mongo().news
    pending = collection.find({'has_keywords': {'$ne': 1}})
    for doc in pending:
        # Skip documents with an empty/missing title or body.
        if not (doc['title'] and doc['content']):
            continue
        combined = doc['title'] + ';' + doc['content']
        doc['keywords'] = get_keywords(combined)
        doc['has_keywords'] = 1
        collection.save(doc)
def sync_google_trends():
    """Queue the newest Google-trends datapoint of every token for syncing."""
    for trend in Mongo().google_trends.find({}):
        points = trend.get('trends')
        if not points:
            continue
        send_google_trend.delay({
            'token_id': trend['token_id'],
            'search_number': points[-1]['value'][0]
        })
def start_spider():
    """Scrape the wallstreetcn blockchain live feed into the news collection.

    :return: ``True`` when the run completed.
    """
    collection = Mongo().news
    dom = PyQuery(url='https://wallstreetcn.com/live/blockchain')
    pane = dom(".wscn-tab-pane")
    panes = pane.items()
    next(panes)          # skip the first tab pane
    pane = next(panes)   # the blockchain feed is the second pane
    lives = pane('.live-item')
    for li in lives.items():
        content = li('.live-item__main__content')('p').text()
        if not content:
            continue
        content_more = li('.live-item__main__content-more')('p').html()
        # Titles are embedded as 【title】 at the head of the content.
        try:
            front_title_index = content.index('【')
            tail_title_index = content.index('】')
            title = content[front_title_index + 1: tail_title_index]
            content = content[tail_title_index + 1:]
        except Exception:
            title = ''
        if content_more:
            content += content_more
        images = [img.attr('src')
                  for img in li('.live-item__main__images')('.zoomer__img').items()]
        # The feed exposes no stable id, so dedupe by exact content match.
        db_count = collection.find({
            'spider_from': 'wallstreetcn',
            'content': content
        }).count()
        if db_count > 0:
            continue
        collection.insert({
            'type': 'news',
            'created_at': int(time.time()),
            'author': "华尔街见闻",
            'spider_from': 'wallstreetcn',
            'source': 'wallstreetcn',
            'source_id': -1,
            'title': title,
            'content': content,
            'url': '',
            # BUG FIX: the scraped image URLs were collected but the original
            # inserted a hard-coded empty list, so images were always lost.
            'images': images,
            'keywords': [],
            'has_send': 0
        })
    return True
def cryptopanic_spider():
    """Fetch cryptopanic posts, translate them and insert unseen ones.

    :return: ``False`` when the upstream fetch yielded nothing, else ``None``.
    """
    collection = Mongo().news
    news = get_cryptopanic()
    if not news:
        return False
    for new in news:
        source_id = new['pk']
        db_count = collection.find({
            'spider_from': 'cryptopanic',
            'source_id': source_id
        }).count()
        if db_count > 0:
            continue
        title, content = new.get('title'), html2text(new.get('body'))
        title_cn, content_cn = google_translate_list([title, content])
        insert_data = {
            'type': new['kind'],
            'created_at': int(time.time()),
            'author': new.get('domain'),
            'spider_from': 'cryptopanic',
            'source': new['source']['domain'],
            'source_id': source_id,
            # FIX: reuse the already-computed title/content instead of
            # re-reading the raw post and running html2text a second time.
            'title': title,
            'content': content,
            'url': new.get('url'),
            'images': new.get('image'),
            'has_keywords': 0,
            'has_send': 0,
            'repeat': -1,
            'has_translated': 1,
            'translated_title': title_cn,
            'translated_text': content_cn
        }
        currencies = new.get('currencies')
        if currencies:
            # Currency codes double as the item's keywords.
            insert_data['keywords'] = [c['code'] for c in currencies]
            insert_data['has_keywords'] = 1
        collection.insert(insert_data)
def start_spider():
    """Fetch the latest jinse live feed and persist items not yet stored.

    :return: ``True`` when the run completed.
    """
    collection = Mongo().news
    response = requests.get(
        'https://api.jinse.com/v4/live/list?limit=20&reading=false')
    for day in response.json()['list']:
        for item in day['lives']:
            source_id = item['id']
            content = item['content']
            # Skip items already present in the collection.
            already_stored = collection.find({
                'spider_from': 'jinse',
                'source_id': source_id
            }).count() > 0
            if already_stored:
                continue
            # Titles arrive embedded as 【title】 at the head of the body.
            try:
                head = content.index('【')
                tail = content.index('】')
                title = content[head + 1:tail]
                content = content[tail + 1:]
            except Exception:
                title = ''
            collection.insert({
                'type': 'news',
                'created_at': int(time.time()),
                'author': "金色快讯",
                'spider_from': 'jinse',
                'source': 'jinse',
                'source_id': source_id,
                'title': title,
                'content': content,
                'url': 'http://www.jinse.com/lives/' + str(source_id) + '.htm',
                'images': [],
                'keywords': [],
                'has_send': 0
            })
    return True
@author: maozhufeng @file: send_news_to_test @time: 2018/6/16 下午12:41 """ import pymongo import requests from common import conf from model.mongo import Mongo collection = Mongo().news news_to_send = collection.find({ 'has_keywords': 1, 'repeat': -1, 'title': { '$ne': '' }, 'content': { '$ne': '' } }) news_to_send = list(news_to_send) news_to_send = [ new for new in news_to_send if new['title'] is not None and new['content'] is not None ] all_count = len(news_to_send) send_count = 0 start = 0 while start < all_count - 1: end = start + 300 if start + 300 <= all_count else all_count send_count += (end - start)
def sync_news(self):
    """Push up to 30 unsent, deduplicated, keyworded news items to the sync host.

    Marks successfully synced documents with ``has_send = 1`` and then
    best-effort mirrors the batch to the test environment. Retries the
    task (via ``self.retry``) when the sync request fails.

    :return: ``True`` when there is nothing to send, else ``None``.
    """
    collection = Mongo().news
    news_to_send = collection.find({
        'has_send': 0,
        'has_keywords': 1,
        'repeat': -1,
        'title': {'$ne': ''},
        'content': {'$ne': ''}
    }).limit(30)
    if news_to_send.count() == 0:
        return True
    news_to_send = [new for new in news_to_send
                    if new['title'] is not None and new['content'] is not None]
    post_data = []
    for new in news_to_send:
        # Normalize second-resolution timestamps to milliseconds.
        created_at = int(new['created_at'])
        if created_at < 10 ** 11:
            created_at *= 1000
        post_data.append({
            'new_id': str(new['_id']),
            'type': new['type'],
            'author': new['author'],
            'spider_from': new['spider_from'],
            'source': new['source'],
            'title': new['title'],
            'content': new['content'],
            'url': new['url'],
            # BUG FIX: the original computed the normalized timestamp above
            # but then sent the raw new['created_at'] value.
            'created_at': created_at,
            'images': new['images'],
            'keywords': new['keywords'],
            'keywordstext': ' '.join(new['keywords']) if new['keywords'] else '',
            'has_translated': str(new.get('has_translated', 0)),
            'translated_text': new.get('translated_text', ''),
            'translated_title': new.get('translated_title', '')
        })
    result = None
    try:
        result = requests.post(conf['sync']['host'] + conf['sync']['news_update'],
                               json={'batch_news': post_data})
        delete_repeat_news()
        print(result)
        result = result.json()
    except Exception as e:
        self.retry(e)
    if result['error_code'] == 0:
        for new in news_to_send:
            new.update({'has_send': 1})
            collection.save(new)
        news_send_finish(news_to_send)
    # TODO test_environment — best-effort mirror; failures are deliberately
    # ignored so they never block the production sync.
    try:
        test_result = requests.post('http://47.52.103.240:18189' + conf['sync']['news_update'],
                                    json={'batch_news': post_data})
        print('send news to test environment')
        print(post_data)
        print(test_result.json())
        print('log_news_id', [_['new_id'] for _ in post_data])
    except Exception:
        pass
def sync_token_twitter():
    """Queue a sync task for every tracked token twitter account."""
    for record in Mongo().twitter.find({}):
        sync_single_token_twitter.delay(
            record['token_id'], record['url'], record['followers_count'])