示例#1
0
def find_repeat_news():
    """
    Classify the news created within the last hour as duplicate or unique.

    ``repeat`` is set to 1 for a duplicate and to -1 for a non-duplicate.
    Items that already carry a truthy ``repeat`` value are not re-evaluated.
    Every fetched item is written back with ``collection.save``.

    :return: None
    """
    collection = Mongo().news
    recent = list(collection.find({'created_at': {'$gt': time.time() - 3600}}))
    for item in recent:
        # Scratch field used by the similarity check; removed before saving.
        item['keywords_temp'] = get_keywords(item['content'])

    for candidate in recent:
        if candidate.get('repeat'):
            # Already classified earlier; only mark it as handled.
            candidate['state'] = 1
            continue
        for other in recent:
            if other['_id'] != candidate['_id'] and is_sim(candidate, other):
                candidate.update({'repeat': 1, 'state': 1})
                break
        else:
            # The scan finished without finding a similar item.
            if candidate.get('state') != 1:
                candidate['state'] = 1
                candidate['repeat'] = -1

    for item in recent:
        # Drop the scratch fields so they are not persisted.
        del item['state']
        del item['keywords_temp']
        collection.save(item)
示例#2
0
def get_erc_transaction():
    """
    Scrape three pages of the Etherscan token list and store the transaction
    value of every token whose name appears in ``key_words``.

    For each matching row, the value returned by
    ``get_single_erc_transaction(contract_address)`` is written to the
    ``token`` collection: the document with the same ``token_name`` is
    updated if it exists, otherwise a new document is inserted.  A failure
    while handling a single token prints its contract address and the scrape
    continues with the next row.

    :return: None
    """
    collection = Mongo().token
    # Take the first 150 entries (three list pages).
    for page in range(1, 4):
        # Ask for the specific page; without a page parameter every iteration
        # would download the same first page again.
        # NOTE(review): assumes Etherscan paginates this list with ``?p=N`` - confirm.
        list_page = PyQuery(url='https://etherscan.io/tokens?p={}'.format(page))
        for token in list_page('tbody')('tr').items():
            link = token('h5')('a')
            name_parts = re.findall(r'\w+', link.text())
            link_target = link.attr('href')
            if not name_parts or not link_target:
                # Row without a usable name or link: nothing to look up.
                continue
            # The last word of the link text is used as the token name.
            token_name = name_parts[-1].lower()
            if token_name not in key_words:
                continue
            href = 'https://etherscan.io' + link_target
            contract_address = href.split('/')[-1]
            try:
                transaction = get_single_erc_transaction(contract_address)
                db_result = collection.find_one({'token_name': token_name})
                if db_result:
                    db_result.update({
                        'transaction': transaction
                    })
                    collection.save(db_result)
                else:
                    collection.insert({
                        'token_name': token_name,
                        'transaction': transaction
                    })
            except Exception:
                # Best effort per token: report the address and keep going.
                print(contract_address)
示例#3
0
 def get_data(token_name, url, api_url):
     """
     Fetch repository statistics from ``api_url`` and upsert them into the
     ``github`` collection, keyed by ``token_name`` and ``github_url``.

     Nothing is stored when the response has no ``forks_count`` field.

     :param token_name: token the repository belongs to
     :param url: repository URL stored as ``github_url``
     :param api_url: API endpoint that is requested for the statistics
     :return: None
     """
     collection = Mongo().github
     request_url = '{}?client_id={}&client_secret={}'.format(
         api_url, 'dcc3734066251548c999', '89d90ad41f32b18d2ed689cb21875b75e88a2d82')
     repo = requests.get(request_url).json()
     if 'forks_count' not in repo:
         # TODO record error result
         return
     existing = collection.find_one({'token_name': token_name, 'github_url': url})
     record = {
         'token_name': token_name,
         'github_url': url,
         'star': repo['stargazers_count'],
         'fork': repo['forks_count'],
         'watch': repo['subscribers_count'],
         'spider_time': time.time(),
         'update_time': repo['updated_at'],
         'create_time': repo['created_at']
     }
     if not existing:
         collection.insert(record)
     else:
         # Refresh the stored document in place and write it back.
         existing.update(record)
         collection.save(existing)
示例#4
0
def get_google_trend(key, token_id):
    """
    Fetch the Google Trends timeline for ``key`` and store it under ``token_id``.

    The widget token and time range come from ``get_google_token(key)``.
    The ``timelineData`` list of the response is saved in the
    ``google_trends`` collection: the document with the same ``token_id`` is
    updated if it exists, otherwise a new one is inserted.

    :param key: search keyword; stored as ``token_name`` on insert
    :param token_id: identifier used to find or create the stored document
    :return: True once the data has been written
    """
    token, search_time = get_google_token(key)
    headers = {
        'host': 'trends.google.com',
        # Standard header names: "User-Agent" and "Referer".  The misspelled
        # variants were sent as unknown headers and had no effect.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Referer': ('https://trends.google.com/trends/explore?q=' + key).encode('utf-8'),
        'x-client-data': 'CJa2yQEIo7bJAQjBtskBCKmdygEIqKPKAQ=='
    }
    request_url = 'https://trends.google.com/trends/api/widgetdata/multiline?hl=zh-CN&tz=-480&req=%7B%22time%22:%22{}%22,%22resolution%22:%22DAY%22,%22locale%22:%22zh-CN%22,%22comparisonItem%22:%5B%7B%22geo%22:%7B%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22{}%22%7D%5D%7D%7D%5D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22IZG%22,%22category%22:0%7D%7D&token={}&tz=-480'.format(
        search_time, key, token)
    # The first five characters of the body are skipped before parsing
    # (presumably a non-JSON guard prefix - confirm against a live response).
    payload = requests.get(request_url, headers=headers).text[5:]
    data = json.loads(payload)['default']['timelineData']

    collection = Mongo().google_trends
    db_result = collection.find_one({
        'token_id': token_id
    })
    if db_result:
        db_result.update({
            'trends': data
        })
        collection.save(db_result)
    else:
        collection.insert({
            'token_id': token_id,
            'token_name': key,
            'trends': data
        })
    return True
示例#5
0
文件: parse.py 项目: Fern9/newsSpider
def deal_content():
    """
    Extract keywords for every news document not yet flagged ``has_keywords``.

    Documents with an empty title or empty content are skipped.  For the
    rest, keywords are computed from ``title + ';' + content`` and the
    document is saved with ``keywords`` set and ``has_keywords`` set to 1.

    :return: None
    """
    collection = Mongo().news
    for doc in collection.find({'has_keywords': {'$ne': 1}}):
        title = doc['title']
        if not title:
            continue
        content = doc['content']
        if not content:
            continue
        doc['keywords'] = get_keywords(title + ';' + content)
        doc['has_keywords'] = 1
        collection.save(doc)
示例#6
0
def news_send_finish(self, news):
    """
    Mark every document in ``news`` as sent (``has_send`` = 1) and save it.

    :param self: object providing ``retry()``; presumably the bound task
        instance - confirm against the decorator in the full file
    :param news: iterable of news documents to flag
    :return: None
    """
    try:
        collection = Mongo().news
        for new in news:
            new.update({
                'has_send': 1
            })
            collection.save(new)
    except Exception:
        # Retry on ordinary failures only; KeyboardInterrupt and SystemExit
        # must propagate instead of scheduling another attempt.
        self.retry()
示例#7
0
def get_user_info(token_name, username, token_id):
    """
    Fetch a Twitter user's profile and upsert it into the ``twitter`` collection.

    The profile JSON is tagged with ``token_name``, ``user_name`` and
    ``token_id`` before it is stored.  A ``TweepError`` raised along the way
    is silently ignored.

    :param token_name: token the account belongs to
    :param username: screen name of the account to fetch
    :param token_id: identifier used together with ``username`` to find the
        stored document
    :return: None
    """
    try:
        collection = Mongo().twitter
        profile = api.get_user(screen_name=username)._json
        profile['token_name'] = token_name
        profile['user_name'] = username
        profile['token_id'] = token_id
        existing = collection.find_one({"token_id": token_id, "user_name": username})
        if not existing:
            collection.insert(profile)
        else:
            # Merge the fresh profile into the stored document.
            existing.update(profile)
            collection.save(existing)
    except TweepError:
        pass
示例#8
0
def get_transaction():
    """
    Scrape blocktivity.info and store a transaction value per token, then
    run ``get_erc_transaction``.

    For every ``.font_size_row`` row, the link text of the third cell
    (lower-cased) is used as ``token_name``, and the digits of the fourth
    cell are joined into an integer stored as ``transaction``.  The document
    with the same ``token_name`` is updated if it exists, otherwise a new
    one is inserted.  Rows whose fourth cell contains no digit are skipped.

    :return: None
    """
    collection = Mongo().token
    dom = PyQuery(url='http://www.blocktivity.info/')
    for row in dom('.font_size_row').items():
        cells = row('td')
        token_name = cells.eq(2)('a').text().lower()
        digits = ''.join(ch for ch in cells.eq(3).text() if ch.isdigit())
        if not digits:
            # int('') raises ValueError, which would abort the whole scrape
            # and also skip the ERC pass below.
            continue
        transaction = int(digits)
        db_result = collection.find_one({'token_name': token_name})
        if db_result:
            db_result.update({
                'transaction': transaction
            })
            collection.save(db_result)
        else:
            collection.insert({
                'token_name': token_name,
                'transaction': transaction
            })
    get_erc_transaction()
示例#9
0
def statistic_tokens_address():
    """
    Store address statistics for every token returned by ``get_tokens``.

    ``statistic_token_address`` yields ``(code, address, increase)`` for the
    lower-cased ticker; when ``code`` is falsy both numbers are stored as 0.
    The values are written to the ``token`` collection as ``address`` and
    ``address_increase``, updating the document with the same ``token_name``
    or inserting a new one.

    :return: None
    """
    collection = Mongo().token
    for entry in get_tokens():
        token_name = entry['ticker'].lower()
        ok, address, increase = statistic_token_address(token_name)
        if not ok:
            address, increase = 0, 0
        stats = {'address': address, 'address_increase': increase}
        existing = collection.find_one({'token_name': token_name})
        if existing:
            existing.update(stats)
            collection.save(existing)
            continue
        # No stored document yet: create one carrying the token name.
        new_doc = {'token_name': token_name}
        new_doc.update(stats)
        collection.insert(new_doc)
示例#10
0
def sync_news(self):
    """
    Post up to 30 unsent news documents to the sync service and flag them as sent.

    Candidates are documents with ``has_send`` 0, ``has_keywords`` 1,
    ``repeat`` -1 and a non-empty title and content; documents whose title
    or content is ``None`` are dropped before posting.  When the service
    answers with ``error_code`` 0, every posted document is saved with
    ``has_send`` 1 and ``news_send_finish`` is called.  The same batch is
    then posted, best effort, to the test environment.

    :param self: object providing ``retry``; presumably the bound task
        instance - confirm against the decorator in the full file
    :return: True when there is nothing to send, otherwise None
    """
    collection = Mongo().news
    news_to_send = list(collection.find({
        'has_send': 0,
        'has_keywords': 1,
        'repeat': -1,
        'title': {'$ne': ''},
        'content': {'$ne': ''}
    }).limit(30))
    if not news_to_send:
        return True
    news_to_send = [new for new in news_to_send
                    if new['title'] is not None and new['content'] is not None]
    post_data = [_build_sync_item(new) for new in news_to_send]

    try:
        response = requests.post(conf['sync']['host'] + conf['sync']['news_update'],
                                 json={'batch_news': post_data})
        delete_repeat_news()
        print(response)
        result = response.json()
    except Exception as e:
        # Hand the error over as ``exc``; passed positionally it would be
        # taken as the first parameter of ``retry`` instead.
        # NOTE(review): assumes a Celery-style ``retry(exc=...)`` - confirm.
        self.retry(exc=e)
        # Reached only if ``retry`` does not raise: there is no usable
        # response to inspect, so stop here.
        return None

    if result['error_code'] == 0:
        for new in news_to_send:
            new.update({
                'has_send': 1
            })
            collection.save(new)
        news_send_finish(news_to_send)

    # TODO test_environment
    try:
        test_result = requests.post('http://47.52.103.240:18189' + conf['sync']['news_update'],
                                    json={'batch_news': post_data})
        print('send news to test environment')
        print(post_data)
        print(test_result.json())
        print('log_news_id', [item['new_id'] for item in post_data])
    except Exception as e:
        # Best-effort mirror: report the failure but never fail the task.
        print('send news to test environment failed', e)


def _build_sync_item(new):
    """Convert one stored news document into the dict posted to the sync service."""
    created_at = int(new['created_at'])
    if created_at < 10 ** 11:
        # Values below 10**11 are scaled by 1000 (seconds to milliseconds).
        created_at *= 1000
    keywords = new['keywords']
    return {
        'new_id': str(new['_id']),
        'type': new['type'],
        'author': new['author'],
        'spider_from': new['spider_from'],
        'source': new['source'],
        'title': new['title'],
        'content': new['content'],
        'url': new['url'],
        # Send the normalized timestamp rather than the raw stored field.
        'created_at': created_at,
        'images': new['images'],
        'keywords': keywords,
        'keywordstext': ' '.join(keywords) if keywords else '',
        'has_translated': str(new.get('has_translated', 0)),
        'translated_text': new.get('translated_text', ''),
        'translated_title': new.get('translated_title', '')
    }