Example #1
import json

import requests
from requests_oauthlib import OAuth1

# `config` (credentials and API URL), `MongoDao`, and `vader` are
# project-local modules assumed by this example.


class TweetFetcher:

    def __init__(self, consumer_key=config.consumer_key,
                 consumer_secret=config.consumer_secret,
                 access_token=config.access_token,
                 access_token_secret=config.access_token_secret):
        self.auth = OAuth1(consumer_key, consumer_secret, access_token,
                           access_token_secret)
        self.session = requests.Session()
        self.db_interface = MongoDao()

    def fetch_by_users(self, twitter_handles):
        params = {'track': ','.join('@' + h for h in twitter_handles)}
        stream = self.session.post(url=config.api_url, auth=self.auth,
                                   data=params, stream=True)
        for line in stream.iter_lines():
            if line:
                print('record found')
                post = json.loads(line)
                post.update(self._compute_sentiment(post['text']))
                self.db_interface.insert(post)
                print('inserted record from {}'.format(post['user']['screen_name']))

    def _compute_sentiment(self, text):
        return vader(text.encode('utf8'))
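A minimal usage sketch (assuming the `config` module carries valid Twitter credentials and the streaming `api_url`):

fetcher = TweetFetcher()
# Stream tweets mentioning these handles into MongoDB, with sentiment attached
fetcher.fetch_by_users(['jack', 'nasa'])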
Example #2

import logging

import aiohttp

# `MongoDao`, `novel_iterator`, `chapter_urls`, `download`, and `parse_chapter`
# are project-local helpers assumed by this example.


async def start():
    """
    启动爬虫

    :return:
    """
    logging.info("启动爬虫,更新小说 ...")

    # Retrieve the full list of novels from the database
    novels = []
    with MongoDao() as dao:
        novels += novel_iterator(dao)

    if not novels:
        return

    async with aiohttp.ClientSession() as session:

        # Iterate over the novels, checking each for updates
        for novel in novels:
            logging.debug("Crawling updates for 《%s》", novel["name"])
            # e.g. {'number': '20380548', 'origin_url': 'https://www.biquge5200.cc/52_52542/20380548.html'}
            novel_id = str(novel["_id"])
            # Fetch the latest stored chapter; its source URL marks where the last update stopped
            with MongoDao() as dao:
                chapter = dao.get_latest_chapter(novel_id)

            # If there are new chapters, crawl them and store them in the database
            new_urls = await chapter_urls(session, novel["origin_url"], chapter and chapter["origin_url"])
            for new_url in new_urls:
                await update_chapter(session, new_url[0], new_url[1], novel_id)

    logging.info("停止爬虫,更新结束 ...")
Example #3

async def update_chapter(aiohttp_session, number, chapter_origin_url, novel_id):
    """
    更新章节信息

    :param aiohttp_session:
    :param number:
    :param chapter_origin_url:
    :param novel_id:
    :return:
    """
    try:
        html = await download(aiohttp_session, chapter_origin_url)
        if html:
            logging.info("更新章节:{}".format(chapter_origin_url))
            # ($title, $content, $url)
            results = await parse_chapter(html, chapter_origin_url)
            if results:
                with MongoDao() as dao:
                    dao.save_chapter(novel_id,
                                     {"number": number,
                                      "title": results[0],
                                      "content": results[1],
                                      "origin_url": results[2]})
    except Exception:
        # Catching Exception (not BaseException) avoids swallowing
        # KeyboardInterrupt and task cancellation.
        logging.exception("Failed to update chapter [%s]", chapter_origin_url)
Example #4
def __init__(self, consumer_key=config.consumer_key,
             consumer_secret=config.consumer_secret,
             access_token=config.access_token,
             access_token_secret=config.access_token_secret):
    self.auth = OAuth1(consumer_key, consumer_secret, access_token,
                       access_token_secret)
    self.session = requests.Session()
    self.db_interface = MongoDao()
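A design note: the keyword defaults (`config.consumer_key` and friends) are evaluated once, when the module is imported, so later changes to `config` do not affect new instances. A common alternative (a sketch, not the author's code) resolves the credentials at call time:

def __init__(self, consumer_key=None, consumer_secret=None,
             access_token=None, access_token_secret=None):
    # Fall back to the config module when the instance is created,
    # not when the module is imported
    self.auth = OAuth1(consumer_key or config.consumer_key,
                       consumer_secret or config.consumer_secret,
                       access_token or config.access_token,
                       access_token_secret or config.access_token_secret)
    self.session = requests.Session()
    self.db_interface = MongoDao()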
Example #5
def query_novel(novel_id):
    with MongoDao() as dao:
        # Query the novel information
        data = dao.get_novel(novel_id)

    return render_template('catalog.html',
                           novel=utils.convert_id_string(data["novel"]),
                           chapters=map(utils.convert_id_string,
                                        data["chapters"]))
Example #6
def query_novels():
    """
    小说列表

    :return:
    """

    with MongoDao() as dao:
        novels = dao.list_novel()

    return render_template('index.html', novels=novels)
Example #7
def query_chapter(novel_id, chapter_id):
    with MongoDao() as dao:
        result = dao.get_chapter(novel_id, chapter_id)
        if not result:
            # flask.abort raises an HTTPException, so nothing past this line runs
            abort(404)

    chapter = result["curr_chapter"]
    # Convert newlines to <br> tags for HTML display
    chapter["content"] = chapter["content"].replace("\n", "<br>")
    return render_template(
        'chapter.html',
        chapter=chapter,
        novel_id=novel_id,
        prev_id=result["prev_chapter"] and str(result["prev_chapter"]["_id"])
        or None,
        next_id=result["next_chapter"] and str(result["next_chapter"]["_id"])
        or None)
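Note that Flask's Jinja2 templates autoescape by default, so the inserted `<br>` tags only render as line breaks if the template marks the value safe (e.g. `{{ chapter.content | safe }}`; the actual templates are not shown here). The three `query_*` functions above look like Flask views; a minimal wiring sketch, with the app object and route paths as assumptions:

from flask import Flask

app = Flask(__name__)
app.add_url_rule('/', view_func=query_novels)
app.add_url_rule('/novel/<novel_id>', view_func=query_novel)
app.add_url_rule('/novel/<novel_id>/chapter/<chapter_id>', view_func=query_chapter)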
Example #8
import logging
import sys
import newspaper
import simplejson as json

from util import LoggerConfig
from newspaper import Config, Source
from dao import MongoDao
from concurrent.futures import ThreadPoolExecutor, wait, as_completed

logging.basicConfig(**LoggerConfig.logger_config)
logger = logging.getLogger(__name__)

newsDao = MongoDao.NewsArticleDao()


class NewsCrawlerConfig(object):
    SITE_FILE = "config/news_sites.json"
    CRAWL_FILE = "config/crawl_options.json"

    class NewsSite(object):
        def __init__(self, name, url, crawl_threads):
            self.name = name
            self.url = url
            self.crawl_threads = crawl_threads

    def __init__(self):
        self.sites = []
        self.crawl_option = Config()
        self.is_config_read = False

    def as_newscrawler(self, site_obj):