예제 #1
0
파일: repeat.py 프로젝트: Fern9/newsSpider
def find_repeat_news():
    """
    Flag near-duplicate news items created within the last hour.

    ``repeat`` is set to 1 for a duplicate and to -1 for a non-duplicate.
    A document counts as a duplicate when ``is_sim`` matches it against any
    other document from the same one-hour window.  Documents that already
    carry a truthy ``repeat`` value (1 or -1) are not re-evaluated.  Every
    document in the window is written back with ``collection.save``.

    :return: None
    """
    collection = Mongo().news
    cutoff = time.time() - 3600
    recent = list(collection.find({'created_at': {'$gt': cutoff}}))

    # Temporary keyword list; removed again before the document is saved.
    for doc in recent:
        doc['keywords_temp'] = get_keywords(doc['content'])

    for candidate in recent:
        if not candidate.get('repeat'):
            for other in recent:
                if other['_id'] != candidate['_id'] and is_sim(candidate, other):
                    candidate['repeat'] = 1
                    break
            else:
                # Compared against every other document without a match.
                candidate['repeat'] = -1
        # Scratch marker: this document has been handled in the current run.
        candidate['state'] = 1

    for doc in recent:
        # Strip the scratch fields so they are never persisted.
        del doc['state']
        del doc['keywords_temp']
        collection.save(doc)
예제 #2
0
def get_github_data(token_name):
    """
    Return the most recently crawled GitHub record for a token.

    :param token_name: value matched against the ``token_name`` field.
    :return: ``return_success(data=...)`` carrying the document with the
        highest ``spider_time`` (its ``_id`` removed), or ``return_error()``
        when anything fails, including no matching document.
    """
    collection = Mongo().github
    try:
        cursor = collection.find({'token_name': token_name})
        newest_first = cursor.sort('spider_time', pymongo.DESCENDING).limit(1)
        record = newest_first[0]
        del record['_id']
        return return_success(data=record)
    except Exception:
        return return_error()
예제 #3
0
def statistic_token_address(token_name):
    """
    Report a token's latest ``address`` value and how much it changed.

    The newest ``token_address`` record is compared with the oldest record
    from the 86400 seconds before it.  When those two carry the same
    ``address`` value, the newest record at or before that 86400-second mark
    is used as the baseline instead, if one exists.

    :param token_name: value matched against the ``token_name`` field.
    :return: ``(False, False, False)`` when the token has no records;
        otherwise ``(True, current_address, current_address - baseline_address)``.
    """
    collection = Mongo().token_address

    # Materialise the (at most one) newest record instead of issuing a
    # separate count() query followed by an index access.
    newest = list(collection.find({
        'token_name': token_name
    }).sort('time', pymongo.DESCENDING).limit(1))
    if not newest:
        return False, False, False
    current_info = newest[0]
    window_start = current_info['time'] - 86400

    # Oldest record inside the window.  The window normally contains the
    # current record itself; fall back to it if the query comes back empty.
    in_window = list(collection.find({
        'time': {'$gt': window_start},
        'token_name': token_name
    }).sort('time', pymongo.ASCENDING).limit(1))
    last_info = in_window[0] if in_window else current_info

    if current_info['address'] - last_info['address'] == 0:
        # No change inside the window: compare with the last record before it.
        # A token with no record that old keeps the in-window baseline (a
        # change of 0) instead of raising IndexError on an empty cursor.
        older = list(collection.find({
            'time': {'$lte': window_start},
            'token_name': token_name
        }).sort('time', pymongo.DESCENDING).limit(1))
        if older:
            last_info = older[0]
    return True, current_info['address'], current_info['address'] - last_info['address']
예제 #4
0
def start_spider(self):
    """
    Fetch the bishijie news-flash feed and store the items not yet saved.

    The feed URL comes from ``conf['news']['bishijie']``.  For each date
    group in the response, items whose ``newsflash_id`` is already stored
    (``spider_from == 'bishijie'``) are skipped.  A title wrapped in
    ``【 】`` is split off the content; when the brackets cannot be located
    the title is left empty and the content untouched.

    Any exception triggers ``self.retry(e)``.

    NOTE(review): the ``self.retry()`` call for a non-zero ``error`` field
    sits inside the outer ``try``; if ``retry`` signals by raising, that
    exception is caught below and ``retry`` is invoked a second time --
    confirm this is intended.
    """
    try:
        payload = requests.get(conf['news']['bishijie']).json()
        collection = Mongo().news
        if payload['error'] != 0:
            self.retry()
        by_date = payload['data']
        for day in by_date:
            flashes = by_date[day]['buttom']
            flash_ids = [flash['newsflash_id'] for flash in flashes]
            stored = collection.find({
                'spider_from': 'bishijie',
                'source_id': {'$in': flash_ids},
            })
            known_ids = [doc['source_id'] for doc in stored]
            for flash in flashes:
                flash_id = flash['newsflash_id']
                if flash_id in known_ids:
                    continue
                body = flash['content']
                try:
                    open_at = body.index('【')
                    close_at = body.index('】')
                    title = body[open_at + 1:close_at]
                    body = body[close_at + 1:]
                except Exception:
                    # No usable bracket pair: keep the full text as content.
                    title = ''
                detail_url = ('http://www.bishijie.com/home/newsflashpc/detail?id=' +
                              str(flash_id))
                collection.insert({
                    'type': 'news',
                    'created_at': flash['issue_time'],
                    'author': flash['source'],
                    'spider_from': 'bishijie',
                    'source': 'bishijie',
                    'source_id': flash_id,
                    'title': title,
                    'content': body,
                    'url': detail_url,
                    'images': [],
                    'keywords': [],
                    'has_send': 0,
                })
    except Exception as e:
        self.retry(e)
예제 #5
0
파일: parse.py 프로젝트: Fern9/newsSpider
def deal_content():
    """
    Attach keywords to every news document not yet marked ``has_keywords``.

    Documents with an empty title or empty content are skipped.  For the
    rest, keywords are extracted from ``title + ';' + content``, stored under
    ``keywords``, and the document is saved with ``has_keywords`` set to 1.
    """
    collection = Mongo().news
    pending = collection.find({'has_keywords': {'$ne': 1}})
    for doc in pending:
        if not (doc['title'] and doc['content']):
            continue
        combined = doc['title'] + ';' + doc['content']
        doc['keywords'] = get_keywords(combined)
        doc['has_keywords'] = 1
        collection.save(doc)
예제 #6
0
def sync_google_trends():
    """
    Queue a ``send_google_trend`` task for every stored Google-trends record.

    Records without a non-empty ``trends`` list are skipped.  The payload
    carries the record's ``token_id`` and the first ``value`` entry of the
    last element in ``trends``.
    """
    collection = Mongo().google_trends
    for record in collection.find({}):
        series = record.get('trends')
        if not series:
            continue
        send_google_trend.delay({
            'token_id': record['token_id'],
            'search_number': series[-1]['value'][0],
        })
예제 #7
0
def start_spider():
    """
    Scrape the wallstreetcn blockchain live feed into the news collection.

    Live items are read from the second ``.wscn-tab-pane`` element of the
    page.  A title wrapped in ``【 】`` is split off the content; when the
    brackets cannot be located the title is left empty.  Any "content-more"
    HTML is appended to the content.  An item is skipped when a document
    with ``spider_from == 'wallstreetcn'`` and identical content already
    exists.

    The image URLs found in the item are stored in the ``images`` field
    (previously they were collected and then discarded in favour of ``[]``).

    :return: True once every live item on the page has been processed.
    """
    collection = Mongo().news
    dom = PyQuery(url='https://wallstreetcn.com/live/blockchain')
    panes = dom(".wscn-tab-pane").items()
    # Skip the first tab pane; the live items are taken from the second one.
    next(panes)
    pane = next(panes)
    lives = pane('.live-item')
    for li in lives.items():
        content = li('.live-item__main__content')('p').text()
        if not content:
            continue
        content_more = li('.live-item__main__content-more')('p').html()
        try:
            front_title_index = content.index('【')
            tail_title_index = content.index('】')
            title = content[front_title_index + 1: tail_title_index]
            content = content[tail_title_index + 1:]
        except Exception:
            # No usable bracket pair: keep the full text as content.
            title = ''
        if content_more:
            content += content_more

        images = [
            image.attr('src')
            for image in li('.live-item__main__images')('.zoomer__img').items()
        ]

        # Check whether this record already exists.
        db_count = collection.find({
            'spider_from': 'wallstreetcn',
            'content': content
        }).count()
        if db_count > 0:
            continue

        insert_data = {
            'type': 'news',
            'created_at': int(time.time()),
            'author': "华尔街见闻",
            'spider_from': 'wallstreetcn',
            'source': 'wallstreetcn',
            'source_id': -1,
            'title': title,
            'content': content,
            'url': '',
            'images': images,
            'keywords': [],
            'has_send': 0
        }
        collection.insert(insert_data)
    return True
예제 #8
0
def cryptopanic_spider():
    """
    Store new cryptopanic posts, together with their translations.

    Posts whose ``pk`` is already stored with ``spider_from == 'cryptopanic'``
    are skipped.  The title and the ``html2text``-converted body are passed
    to ``google_translate_list`` and saved alongside the originals.  When a
    post lists currencies, their ``code`` values become the document's
    ``keywords`` and ``has_keywords`` is set to 1.

    :return: False when ``get_cryptopanic()`` yields nothing; otherwise None.
    """
    collection = Mongo().news
    news = get_cryptopanic()
    if not news:
        return False
    for new in news:
        source_id = new['pk']
        db_count = collection.find({
            'spider_from': 'cryptopanic',
            'source_id': source_id
        }).count()
        if db_count > 0:
            continue
        # Convert the body once and reuse it for translation and storage.
        title, content = new.get('title'), html2text(new.get('body'))
        title_cn, content_cn = google_translate_list([title, content])
        insert_data = {
            'type': new['kind'],
            'created_at': int(time.time()),
            'author': new.get('domain'),
            'spider_from': 'cryptopanic',
            'source': new['source']['domain'],
            'source_id': source_id,
            'title': title,
            'content': content,
            'url': new.get('url'),
            'images': new.get('image'),
            'has_keywords': 0,
            'has_send': 0,
            'repeat': -1,
            'has_translated': 1,
            'translated_title': title_cn,
            'translated_text': content_cn
        }
        currencies = new.get('currencies')
        if currencies:
            # The currency codes serve as ready-made keywords.
            insert_data['keywords'] = [
                currency['code'] for currency in currencies
            ]
            insert_data['has_keywords'] = 1
        collection.insert(insert_data)
예제 #9
0
파일: sprider.py 프로젝트: Fern9/newsSpider
def start_spider():
    """
    Fetch the latest jinse live items and store the ones not yet saved.

    Items whose ``id`` is already stored with ``spider_from == 'jinse'`` are
    skipped.  A title wrapped in ``【 】`` is split off the content; when the
    brackets cannot be located the title is left empty.

    :return: True once every item in the response has been processed.
    """
    collection = Mongo().news
    response = requests.get(
        'https://api.jinse.com/v4/live/list?limit=20&reading=false')
    for day in response.json()['list']:
        for live in day['lives']:
            live_id = live['id']
            body = live['content']

            # Check whether this record already exists.
            already_stored = collection.find({
                'spider_from': 'jinse',
                'source_id': live_id
            }).count()
            if already_stored > 0:
                continue

            try:
                open_at = body.index('【')
                close_at = body.index('】')
                title = body[open_at + 1:close_at]
                body = body[close_at + 1:]
            except Exception:
                # No usable bracket pair: keep the full text as content.
                title = ''

            collection.insert({
                'type': 'news',
                'created_at': int(time.time()),
                'author': "金色快讯",
                'spider_from': 'jinse',
                'source': 'jinse',
                'source_id': live_id,
                'title': title,
                'content': body,
                'url': 'http://www.jinse.com/lives/' + str(live_id) + '.htm',
                'images': [],
                'keywords': [],
                'has_send': 0,
            })
    return True
예제 #10
0
@author: maozhufeng
@file: send_news_to_test
@time: 2018/6/16 下午12:41
"""
import pymongo
import requests

from common import conf
from model.mongo import Mongo

# Select every news document that has keywords, is flagged repeat == -1 and
# whose title and content are not empty strings.
collection = Mongo().news
news_to_send = collection.find({
    'has_keywords': 1,
    'repeat': -1,
    'title': {
        '$ne': ''
    },
    'content': {
        '$ne': ''
    }
})
news_to_send = list(news_to_send)
# The query only excludes empty strings, so drop None titles/contents here.
news_to_send = [
    new for new in news_to_send
    if new['title'] is not None and new['content'] is not None
]
all_count = len(news_to_send)
send_count = 0
start = 0
# Walk the list in windows of at most 300 items.
# NOTE(review): nothing in the visible lines advances `start`; the rest of the
# loop body is presumably outside this excerpt -- confirm.
# NOTE(review): `start < all_count - 1` never enters the loop for a final
# window of exactly one item (e.g. a single document) -- confirm intended.
while start < all_count - 1:
    end = start + 300 if start + 300 <= all_count else all_count
    send_count += (end - start)
예제 #11
0
def sync_news(self):
    """
    Push up to 30 unsent news items to the sync service.

    Candidates have ``has_send == 0``, ``has_keywords == 1``,
    ``repeat == -1`` and a non-empty, non-None title and content.  They are
    posted as one batch to ``conf['sync']['host'] + conf['sync']['news_update']``.
    When the response reports ``error_code == 0`` each item is saved with
    ``has_send = 1`` and ``news_send_finish`` is called.  The same batch is
    then posted, best-effort, to the hard-coded test environment.

    ``created_at`` is sent in milliseconds: values below ``10 ** 11`` are
    treated as seconds and multiplied by 1000.  (The normalised value used
    to be computed and then dropped in favour of the raw field.)

    :return: True when there is nothing to send; otherwise None.
    """
    collection = Mongo().news
    # Materialise the limited cursor directly; an empty list means nothing to
    # send, so no separate count() query is needed.
    news_to_send = list(collection.find({
        'has_send': 0,
        'has_keywords': 1,
        'repeat': -1,
        'title': {'$ne': ''},
        'content': {'$ne': ''}
    }).limit(30))
    if not news_to_send:
        return True
    # The query only excludes empty strings, so drop None values here.
    news_to_send = [new for new in news_to_send if new['title'] is not None and new['content'] is not None]
    post_data = []
    for new in news_to_send:
        created_at = int(new['created_at'])
        if created_at < 10 ** 11:
            # Looks like a seconds timestamp; convert to milliseconds.
            created_at *= 1000
        post_data.append({
            'new_id': str(new['_id']),
            'type': new['type'],
            'author': new['author'],
            'spider_from': new['spider_from'],
            'source': new['source'],
            'title': new['title'],
            'content': new['content'],
            'url': new['url'],
            'created_at': created_at,
            'images': new['images'],
            'keywords': new['keywords'],
            'keywordstext': ' '.join(new['keywords']) if new['keywords'] else '',
            'has_translated': str(new.get('has_translated', 0)),
            'translated_text': new.get('translated_text', ''),
            'translated_title': new.get('translated_title', '')
        })
    result = None
    try:
        response = requests.post(conf['sync']['host'] + conf['sync']['news_update'],
                                 json={'batch_news': post_data})
        delete_repeat_news()
        print(response)
        result = response.json()
    except Exception as e:
        self.retry(e)

    # `result` stays None when the request or JSON decoding failed and
    # `self.retry` returned instead of raising.
    if result is not None and result['error_code'] == 0:
        for new in news_to_send:
            new['has_send'] = 1
            collection.save(new)
        news_send_finish(news_to_send)

    # TODO test_environment
    try:
        test_result = requests.post('http://47.52.103.240:18189' + conf['sync']['news_update'],
                                    json={'batch_news': post_data})
        print('send news to test environment')
        print(post_data)
        print(test_result.json())
        print('log_news_id', [_['new_id'] for _ in post_data])
    except Exception as exc:
        # Best-effort mirror: never fail the task, but do not hide the error.
        print('send news to test environment failed', exc)
예제 #12
0
def sync_token_twitter():
    """
    Queue a ``sync_single_token_twitter`` task for every stored twitter record.

    Each task receives the record's ``token_id``, ``url`` and
    ``followers_count``.
    """
    collection = Mongo().twitter
    for account in collection.find({}):
        sync_single_token_twitter.delay(
            account['token_id'],
            account['url'],
            account['followers_count'],
        )