"""
Crawl the full list of Zhihu topics.
"""
import HTMLParser
import datetime

from pymongo import MongoClient
from xtls.basecrawler import BaseCrawler
from xtls.codehelper import trytry
from xtls.logger import get_logger
from xtls.timeparser import now, parse_time
from xtls.util import BeautifulSoup

from config import *

logger = get_logger(__file__)
ZHIHU_URL = 'http://www.zhihu.com'
TOPIC_URL = 'http://www.zhihu.com/topic/{tid}/top-answers?page='
MONGO = MongoClient(MONGO_HOST, MONGO_PORT)


class TopicHotCrawler(BaseCrawler):
    def __init__(self, topic_id):
        super(TopicHotCrawler, self).__init__(topic_id=topic_id)

    @classmethod
    def save(cls, data):
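        # Upsert keyed on _id so re-crawling the same topic updates the stored
        # document instead of inserting a duplicate.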
        MONGO[DB][TOPIC_COLL].update_one({'_id': data['_id']}, {'$set': data}, upsert=True)

    @classmethod
    def unescape(cls, string):
        # Decode HTML entities (e.g. &amp; -> &) left in the crawled markup,
        # using the Python 2 HTMLParser helper imported above.
        return HTMLParser.HTMLParser().unescape(string)
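
# A minimal usage sketch, assuming BaseCrawler (from xtls.basecrawler) exposes a
# crawl() entry point that drives the fetch loop -- that method is not shown in
# this snippet, and TOPIC_IDS below is a hypothetical iterable of Zhihu topic ids:
#
#     if __name__ == '__main__':
#         for tid in TOPIC_IDS:
#             TopicHotCrawler(tid).crawl()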
Example #2
#!/usr/bin/env python
# encoding=utf-8

import sys

from tornado.httpserver import HTTPServer
from tornado.ioloop import IOLoop
from tornado.wsgi import WSGIContainer
from werkzeug.contrib.fixers import ProxyFix
from xtls.logger import get_logger

from app import create_app

app = create_app()
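# ProxyFix restores the real client address, host, and scheme from the
# X-Forwarded-* headers set by a fronting reverse proxy (e.g. nginx), so the
# app sees the original request rather than the proxy's.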
app.wsgi_app = ProxyFix(app.wsgi_app)
logger = get_logger(__file__)

if __name__ == '__main__':
    host = '0.0.0.0'
    port = 1994 if len(sys.argv) < 2 else int(sys.argv[1])
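    # Wrap the WSGI app in Tornado's WSGIContainer so it can be served by
    # HTTPServer on Tornado's single-threaded IOLoop.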
    http_server = HTTPServer(WSGIContainer(app))
    http_server.listen(port=port, address=host)
    logger.info('lawyer asst server start at {}:{}'.format(host, port))
    IOLoop.instance().start()
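
# Usage note (script name assumed): `python server.py 8080` serves on
# 0.0.0.0:8080; with no argument the port defaults to 1994.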