Example No. 1
import configparser
from datetime import datetime

# get_news_pool and crawl_news are helpers defined elsewhere in the crawler project.
def crawling():
    print('-----start crawling time: %s-----' % (datetime.today()))
    config = configparser.ConfigParser()
    config.read('..\\config.ini', 'utf-8')
    news_pool = get_news_pool()
    crawl_news(news_pool, 140, config['DEFAULT']['doc_dir_path'],
               config['DEFAULT']['doc_encoding'])
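All three examples read the same two keys from config.ini. A minimal sketch of that file, covering only the keys the snippets actually use (the path and encoding values here are illustrative assumptions):

[DEFAULT]
doc_dir_path = ../data/news/
doc_encoding = utf-8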
Example No. 2
import configparser
from datetime import datetime

def crawling():
    print('-----start crawling time: %s-----' % (datetime.today()))
    config = configparser.ConfigParser()
    config.read('../config.ini', 'utf-8')
    root = 'http://news.sohu.com/1/0903/61/subject212846158'
    max_page = get_max_page(root + '.shtml')
    news_pool = get_news_pool(root, max_page, max_page - 5)
    crawl_news(news_pool, 140, config['DEFAULT']['doc_dir_path'],
               config['DEFAULT']['doc_encoding'])
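get_max_page and get_news_pool are project helpers whose bodies are not shown on this page. Below is a hedged sketch of get_news_pool, assuming the Sohu subject list paginates as root_<n>.shtml; the link-matching regex and the encoding are illustrative guesses, not the project's actual parsing code:

import re
import urllib.request

def get_news_pool(root, end, start):
    # Collect article links from list pages start..end (illustrative only).
    pool = []
    for page in range(start, end + 1):
        url = '%s_%d.shtml' % (root, page)
        html = urllib.request.urlopen(url, timeout=10).read().decode('utf-8', errors='ignore')
        # Assumed pattern: list pages link to article pages ending in .shtml.
        pool.extend(re.findall(r'href="(http://news\.sohu\.com/[^"]+\.shtml)"', html))
    return pool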
Example No. 3
import configparser
from datetime import datetime

def crawling():
    print('-----start crawling time: %s-----' % (datetime.today()))
    config = configparser.ConfigParser()
    config.read('../config.ini', 'utf-8')
    # News headline listing page, e.g. the "General News" page:
    # http://news.gpnu.edu.cn/index/zhxw.htm
    root = 'http://news.gpnu.edu.cn/index/zhxw'
    # max_page = get_max_page(root + '.shtml')
    print('***', root + '.htm')
    max_page = get_max_page(root + '.htm')

    news_pool = get_news_pool(root, max_page, max_page - 5)
    print("=========separator line=========")
    crawl_news(news_pool, 140, config['DEFAULT']['doc_dir_path'],
               config['DEFAULT']['doc_encoding'])
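crawl_news is likewise defined elsewhere; every example calls it with the pool, the number 140, and the two config values. A sketch under two stated assumptions: that 140 is a minimum body length, and that each document is stored as a numbered file in doc_dir_path. Neither is confirmed by the snippets above.

import os
import urllib.request

def crawl_news(news_pool, min_body_len, doc_dir_path, doc_encoding):
    os.makedirs(doc_dir_path, exist_ok=True)
    for i, url in enumerate(news_pool):
        try:
            body = urllib.request.urlopen(url, timeout=10).read().decode(doc_encoding, errors='ignore')
        except OSError:
            continue  # skip unreachable pages
        if len(body) < min_body_len:
            continue  # assumed: drop pages with too little content
        with open(os.path.join(doc_dir_path, '%d.html' % i), 'w', encoding=doc_encoding) as f:
            f.write(body)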