def get_all_article():
    # Sequential version: crawl every category page, then fetch each article one by one.
    article = []
    nav_url = find_data(url_base)
    article_list = []
    parse_log.info('Start crawling all article links for today....')
    for i in nav_url:  # walk the navigation-bar links
        page_num, art_list = get_page(i['type_url'])
        if not page_num:  # skip categories with no articles
            continue
        parse_log.debug('Collected {0} article links'.format(page_num))
        article_list.append({
            'type_name': i['type_name'],
            'art_list': art_list
        })

    upload_end = []  # URLs already fetched in this run
    for i in article_list:
        waiting_upload = i['art_list']
        for index, a in enumerate(waiting_upload):
            if a in upload_end:
                parse_log.debug('Already fetched, skipping')
                continue
            article_title, artibody = handle_article(a)
            if not artibody or not article_title:
                continue
            article.append({
                'title': article_title,
                'body': artibody,
                'type': i['type_name'],
                'url': a,
                'source_url': url_base
            })
            parse_log.debug(u'Fetched article: {0} category.....{1}/{2}........{3}'.format(
                i['type_name'], index, len(waiting_upload), a))
            upload_end.append(a)
    parse_log.info('Fetched {0} articles'.format(len(article)))
    return article
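get_all_article leans on handle_article(url) to turn an article link into a (title, body) pair, but that helper is defined elsewhere in the project. A minimal sketch of its assumed contract, using requests and BeautifulSoup with a hypothetical body selector, might look like this:

import requests
from bs4 import BeautifulSoup

def handle_article(url):
    # Sketch only: the real parser's selectors depend on each target site's markup.
    try:
        resp = requests.get(url, timeout=10)
        resp.encoding = resp.apparent_encoding  # many Chinese news sites serve GBK
        soup = BeautifulSoup(resp.text, 'html.parser')
        title_tag = soup.find('h1')
        body_tag = soup.find('div', class_='article-body')  # hypothetical class name
        if title_tag is None or body_tag is None:
            return None, None
        return title_tag.get_text(strip=True), body_tag.get_text('\n', strip=True)
    except requests.RequestException:
        return None, None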
def get_all_article():
    # Threaded version: collect all links first, then fetch article bodies with a
    # small thread pool, skipping URLs already recorded in the cache.
    nav_url = find_data(url_base)
    article_list = []
    parse_log.info('Start crawling all article links for today....')
    for i in nav_url:  # walk the navigation-bar links
        page_num, art_list = get_page(i['type_url'], i['type_name'])
        if not page_num:  # skip categories with no articles
            continue
        article_list.extend(art_list)
    parse_log.info('Collected {0} links'.format(len(article_list)))

    cache = Cache_file()
    url_list = cache.read()  # URLs crawled on previous runs

    def get_wenzhang(i):
        if i['wenzhang_url'] in url_list:
            return None
        article_title, artibody = handle_article(i['wenzhang_url'])
        url_list.append(i['wenzhang_url'])
        if not artibody or not article_title:
            return None
        return {
            'title': article_title,
            'body': artibody,
            'type': i['type_name'],
            'url': i['wenzhang_url'],
            'source_url': url_base
        }

    # ThreadPool is a thread-backed pool (e.g. multiprocessing.dummy.ThreadPool);
    # threads suit this I/O-bound fetching step.
    pool = ThreadPool(4)
    results = pool.map(get_wenzhang, article_list)
    pool.close()
    pool.join()

    results2 = [i for i in results if i is not None]
    cache.save(url_list)
    parse_log.info('Fetched {0} articles'.format(len(results2)))
    return results2
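The threaded version also depends on a Cache_file helper that persists the already-crawled URLs between runs (cache.read() / cache.save()). Its real storage format is not shown here; a minimal sketch, assuming a plain JSON file named crawled_urls.json, could be:

import json
import os

class Cache_file(object):
    # Sketch of the assumed interface: read() returns the cached URL list,
    # save() writes it back. The file name and JSON format are assumptions.
    def __init__(self, path='crawled_urls.json'):
        self.path = path

    def read(self):
        if not os.path.exists(self.path):
            return []
        with open(self.path, 'r') as f:
            return json.load(f)

    def save(self, url_list):
        with open(self.path, 'w') as f:
            json.dump(url_list, f)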
def main():
    upload_goto = []  # upload target configuration
    article_list = []

    # Crawl each source in turn; a failure on one site should not abort the whole run.
    sources = [
        ('新浪新闻', parse_xinlang),
        ('汇金网', parse_huijinwang),
        ('网易财经', parse_wangyicaijin),
        ('rong360', spider_rong360),
        ('南方财富网', spider_south),
        ('中国经济网', spider_jingjiwang),
    ]
    for name, module in sources:
        try:
            articles = module.get_all_article()
            parse_log.info('Fetched {0} articles from {1}'.format(len(articles), name))
            article_list.extend(articles)
        except Exception:
            parse_log.exception('Failed to crawl {0}'.format(name))

    parse_log.info('Fetched {0} articles in total, writing them to the database'.format(len(article_list)))
    for i in article_list:
        time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        d = {
            '_id': i['title'],
            'title': i['title'],
            'body': str(i['body']),
            'type': i['type'],
            'url': i['url'],
            'source_url': i['source_url'],
            'date': today_date,
            'create_time': time_now
        }
        insert_data(d, 'news_data')

    today_article = list(find_data({'date': today_date}, 'news_data'))
    random.shuffle(today_article)
    wait_upload = today_article
    parse_log.info('{0} articles waiting for upload today'.format(len(wait_upload)))

    for i in upload_goto:
        batch = wait_upload[:i['upload_max']]  # cap each target at its own limit
        for a in batch:
            upload(a, i)
        parse_log.info('Upload finished: {0}'.format(len(batch)))
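main() performs one crawl/store/upload pass, and upload_goto starts empty, so nothing is uploaded until targets are configured; 'upload_max' is the only key main() itself reads from a target, the rest is whatever upload() expects. A hypothetical entry point that reruns the pipeline once a day (the original project may schedule it differently, e.g. via cron) could be:

import time

if __name__ == '__main__':
    # Hypothetical scheduler: one crawl/store/upload pass per day.
    while True:
        main()
        time.sleep(24 * 60 * 60)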