""" 抓取知乎所有话题列表 """ import HTMLParser import datetime from pymongo import MongoClient from xtls.basecrawler import BaseCrawler from xtls.codehelper import trytry from xtls.logger import get_logger from xtls.timeparser import now, parse_time from xtls.util import BeautifulSoup from config import * logger = get_logger(__file__) ZHIHU_URL = 'http://www.zhihu.com' TOPIC_URL = 'http://www.zhihu.com/topic/{tid}/top-answers?page=' MONGO = MongoClient(MONGO_HOST, MONGO_PORT) class TopicHotCrawler(BaseCrawler): def __init__(self, topic_id): super(TopicHotCrawler, self).__init__(topic_id=topic_id) @classmethod def save(cls, data): MONGO[DB][TOPIC_COLL].update_one({'_id': data['_id']}, {'$set': data}, upsert=True) @classmethod def unescape(cls, string):
#!/usr/bin/env python
# encoding=utf-8
import sys

from tornado.httpserver import HTTPServer
from tornado.ioloop import IOLoop
from tornado.wsgi import WSGIContainer
from werkzeug.contrib.fixers import ProxyFix

from xtls.logger import get_logger

from app import create_app

app = create_app()
app.wsgi_app = ProxyFix(app.wsgi_app)
logger = get_logger(__file__)

if __name__ == '__main__':
    host = '0.0.0.0'
    port = 1994 if len(sys.argv) < 2 else int(sys.argv[1])
    http_server = HTTPServer(WSGIContainer(app))
    http_server.listen(port=port, address=host)
    logger.info('lawyer asst server start at {}:{}'.format(host, port))
    IOLoop.instance().start()
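
# Usage sketch (assumes this file is saved as, e.g., server.py;
# the filename is an assumption, the port behaviour is from the code above):
#   python server.py          # listen on 0.0.0.0:1994 (default port)
#   python server.py 8080     # listen on 0.0.0.0:8080
# The optional first command-line argument overrides the default port 1994.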