def execute_info_7x24():
    crawler.info('The info_7x24 task is starting...')
    for au_message724_field in parse_info_7x24_list():
        app.send_task('tasks.info_7x24.save_info_7x24_list',
                      args=(au_message724_field, ),
                      queue='info_7x24',
                      routing_key='for_info_7x24')
def save_info_7x24_list(au_message724_field):
    crawler.info('The info_7x24 list task is starting...')
    AuMessage724.add(au_message724_field)
    app.send_task('tasks.info_7x24.save_info_7x24_content',
                  args=(au_message724_field['ArticleID'], ),
                  queue='info_7x24',
                  routing_key='for_info_7x24')
def execute_repost_task():
    # regard the current weibo url as the original url; you could also analyse from the root url
    weibo_datas = WbDataOper.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))
    for weibo_data in weibo_datas:
        app.send_task('tasks.repost.crawl_repost_page',
                      args=(weibo_data.weibo_id, weibo_data.uid),
                      queue='repost_crawler',
                      routing_key='repost_info')
def execute_repost_task():
    # regard the current weibo url as the original url; you could also analyse from the root url
    weibo_datas = WbDataOper.get_weibo_repost_not_crawled()
    crawler.info('There are {} repost urls to be crawled'.format(len(weibo_datas)))
    for weibo_data in weibo_datas:
        crawl_repost_page(weibo_data.weibo_id, weibo_data.uid)
def execute_market():
    crawler.info('The market task is starting...')
    for b_market_field in parse_market_list():
        app.send_task('tasks.market.save_market',
                      args=(b_market_field, ),
                      queue='market',
                      routing_key='for_market')
def save_trade_info_list(a_information_field):
    crawler.info('The trade info list task is starting...')
    BDxtInformation.add(a_information_field)
    app.send_task('tasks.trade_info.save_trade_info_content',
                  args=(a_information_field['information_id'], ),
                  queue='trade_info',
                  routing_key='for_trade_info')
def execute_user_task():
    seeds, is_exists = SeedUser.get_seed_names()
    if is_exists:
        for seed in seeds:
            crawler.info(f"send task crawl_user_info {seed.name}")
            app.send_task("tasks.user.crawl_user_info", args=(seed.name, ))
    else:
        crawler.info("found no seed user, abort")
def execute_info_trade():
    crawler.info('The trade info task is starting...')
    for a_information_field in parse_info_trade_list():
        app.send_task('tasks.trade_info.save_trade_info_list',
                      args=(a_information_field, ),
                      queue='trade_info',
                      routing_key='for_trade_info')
def execute_login_task():
    infos = LoginInfoOper.get_login_info()
    # Clear all stacked login tasks before each login round
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    for info in infos:
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
def get_page(url):
    crawler.info("the crawling url is {url}".format(url=url))
    # proxies = get_proxy()
    time.sleep(REQUEST_INTERVAL)
    crawler.info(f"sleep {REQUEST_INTERVAL}")
    resp = requests_retry_session().get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return resp.text
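# The requests_retry_session() helper used above is not defined in this snippet. Below is a
# minimal sketch of how such a helper is commonly built with requests and urllib3; the
# constant names RETRY_TOTAL and BACKOFF_FACTOR are illustrative assumptions, not taken
# from the original code.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

RETRY_TOTAL = 3        # assumed retry budget
BACKOFF_FACTOR = 0.5   # assumed exponential backoff factor


def requests_retry_session(session=None):
    """Return a requests.Session that retries transient failures with backoff."""
    session = session or requests.Session()
    retry = Retry(total=RETRY_TOTAL,
                  backoff_factor=BACKOFF_FACTOR,
                  status_forcelist=(500, 502, 503, 504))
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session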
def crawl_user_info(name):
    """Crawl the information on a user's home page.

    :param name: user name
    :return: None
    """
    if not name:
        return None
    crawler.info(f"received task crawl_user_info {name}")
    user, other_crawled = get_profile(name)
    if not other_crawled:
        crawler.info(f"send task crawl_follower_fans {user.name}")
        app.send_task("tasks.user.crawl_follower_fans", args=(user.name, ))
def crawl(mid):
    # gu = BiliUser(mid)
    crawl_code = -1
    try:
        res = BiliUser.store(mid)
        if res:
            crawl_code = 0
            crawler.info('%s_%d' % (mid, crawl_code))
    except Exception:
        crawler.info('%s_%d' % (mid, crawl_code))
        crawl_code = -1
    # time.sleep(1)
    return crawl_code
def execute_login_task():
    # Get the info of all weibo accounts that need to log in
    infos = LoginInfoOper.get_login_info()
    # Clear all stacked login tasks before each login round
    Cookies.check_login_task()
    crawler.info('The login task is starting...')
    for info in infos:
        # Send the task with its args so the worker starts it;
        # the queue parameter routes the task through that queue,
        # and the routing key is given by routing_key
        app.send_task('tasks.login.login_task',
                      args=(info.name, info.password),
                      queue='login_queue',
                      routing_key='for_login')
        time.sleep(10)
def search_keyword(keyword, keyword_id):
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        if cur_page == 1:
            search_page = get_page(cur_url, auth_level=1)
        else:
            search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has already been
        # stored in mysql, we need not crawl the same keyword again in this turn.
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            if rs:
                crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
                return
            else:
                WbDataOper.add_one(wb_data)
                KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # send task for crawling user info
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')

        if cur_page == 1:
            cur_page += 1
        elif 'page next S_txt1 S_line1' in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # currently we only crawl with login; maybe later crawl page one without login
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        # Because the search results are sorted by time, if any result has already been
        # stored in mysql, we need not crawl the same keyword again in this turn.
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return
def save_info_7x24_content(article_id):
    crawler.info('The info_7x24 content task is starting...')
    crawler.info('The info_7x24 content article id is: {}'.format(article_id))
    res_div = parse_content(article_id)
    crawler.info('The info_7x24 content is: {}...'.format(res_div[:20]))
    AuMessage724.objects(ArticleID=article_id).update(ArticleContent=res_div)
def save_trade_info_content(information_id):
    crawler.info('The trade info content list task is starting...')
    crawler.info('The trade info content info id is: {}'.format(information_id))
    res_div = parse_content(information_id)
    crawler.info('The trade info content is: {}...'.format(res_div[:20]))
    BDxtInformation.objects(information_id=information_id).update(content=res_div)
def search_items_v2(keyword, keyword_id, date_item):
    search_time_list = [
        "{}-{}:{}-{}".format(d, t, d, t + 2)
        for d, t in itertools.product([date_item], TIME_LIIT)
    ]
    for s_time in search_time_list:
        crawler.info('We are searching keyword "{}", {}'.format(keyword, s_time))
        cur_page = 1
        encode_keyword = url_parse.quote(keyword)
        while cur_page < LIMIT:
            cur_url = MAX_URL.format(encode_keyword, cur_page, s_time)
            # currently we only crawl with login; maybe later crawl page one without login
            search_page = get_page(cur_url, auth_level=1, need_proxy=True)
            if "您可以尝试更换关键词,再次搜索" in search_page:
                break
            if not search_page:
                crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
                cur_page += 1
                continue
                # return

            search_list = parse_search.get_search_info(search_page)

            if cur_page == 1:
                cur_page += 1
            elif 'noresult_tit' not in search_page:
                cur_page += 1
            else:
                crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
                return

            # Because the search results are sorted by time, if any result has already been
            # stored in mysql, we don't need to crawl the same keyword again in this turn.
            for wb_data in search_list:
                # print(wb_data)
                rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
                KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
                # todo: incremental crawling using time
                if rs:
                    crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                    continue
                else:
                    WbDataOper.add_one(wb_data)
                    # todo: only add seed ids and remove this task
                    app.send_task('tasks.user.crawl_person_infos',
                                  args=(wb_data.uid, ),
                                  queue='user_crawler',
                                  routing_key='for_user_info')
def search_keyword_topic(keyword, keyword_id, start_time='', end_time=''):
    crawler.info('We are crawling weibo topic content with keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, start_time, end_time, cur_page)
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.info('No such result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_topic.get_search_info(search_page)

        if cur_page == 1:
            cur_page += 1
        elif '您可以尝试更换关键词' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')
def search_keyword(keyword, keyword_id):
    crawler.info('We are searching keyword "{}"'.format(keyword))
    cur_page = 1
    encode_keyword = url_parse.quote(keyword)
    while cur_page < LIMIT:
        cur_url = URL.format(encode_keyword, cur_page)
        # currently we only crawl with login; maybe later crawl page one without login
        search_page = get_page(cur_url, auth_level=2)
        if not search_page:
            crawler.warning('No search result for keyword {}, the source page is {}'.format(keyword, search_page))
            return

        search_list = parse_search.get_search_info(search_page)

        if cur_page == 1:
            cur_page += 1
        elif 'noresult_tit' not in search_page:
            cur_page += 1
        else:
            crawler.info('Keyword {} has been crawled in this turn'.format(keyword))
            return

        # Because the search results are sorted by time, if any result has already been
        # stored in mysql, we don't need to crawl the same keyword again in this turn.
        for wb_data in search_list:
            rs = WbDataOper.get_wb_by_mid(wb_data.weibo_id)
            KeywordsDataOper.insert_keyword_wbid(keyword_id, wb_data.weibo_id)
            # todo: incremental crawling using time
            if rs:
                crawler.info('Weibo {} has been crawled, skip it.'.format(wb_data.weibo_id))
                continue
            else:
                WbDataOper.add_one(wb_data)
                # todo: only add seed ids and remove this task
                app.send_task('tasks.user.crawl_person_infos',
                              args=(wb_data.uid, ),
                              queue='user_crawler',
                              routing_key='for_user_info')
start_time = datetime.datetime.strptime(STARTTIME, "%Y-%m-%d %H:%M:%S")
end_time = datetime.datetime.strptime(ENDTIME, "%Y-%m-%d %H:%M:%S")
OneHour = datetime.timedelta(hours=1)
time1 = start_time
time2 = start_time + OneHour


def get_topic_data(keyword, start_time='', end_time=''):
    from tasks.topic import search_keyword_topic
    search_keyword_topic(keyword, 33, start_time, end_time)


while 1:
    while time1 < end_time:
        start_time_str = time1.strftime("%Y-%m-%d-%H")
        end_time_str = time2.strftime("%Y-%m-%d-%H")
        print(start_time_str, '--', end_time_str)
        crawler.info("we are crawling keyword:{}, timerange {}:{} content".format(
            KEYWORD, start_time_str, end_time_str))
        get_topic_data(KEYWORD, start_time_str, end_time_str)
        time1 = time1 + OneHour
        time2 = time2 + OneHour
    time1 = start_time
    time2 = start_time + OneHour
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0 means nothing is needed, 1 means no login but cookies are needed,
                       2 means login is needed
    :param is_ajax: whether the request is ajax
    :param need_proxy: whether the request needs an http/https proxy
    :return: response text; when an exception is raised, return ''
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()
            if name_cookies is None:
                crawler.warning('No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # There is no difference between http and https address.
            proxy = {
                'http': name_cookies[2],
                'https': name_cookies[2],
            }
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')

        try:
            if auth_level == 2:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=TIME_OUT,
                                    verify=False, proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url, headers=headers, cookies=COOKIES, timeout=TIME_OUT,
                                    verify=False, proxies=proxy)
            else:
                resp = requests.get(url, headers=headers, timeout=TIME_OUT, verify=False, proxies=proxy)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exceptions are raised when crawling {}. Here are details: {}'.format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue

        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)

            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

        if not is_ajax and not is_complete(page):
            count += 1
            continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''

        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''
# -*- coding: utf8 -*-

"""
test logger
"""

import sys
import os.path

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

from logger import crawler, storage

crawler.info('crawler')
storage.info('database connect error')
def get_page(url, auth_level=2, is_ajax=False, need_proxy=False):
    """
    :param url: url to crawl
    :param auth_level: 0 means nothing is needed, 1 means no login but cookies are needed,
                       2 means login is needed
    :param is_ajax: whether the request is ajax
    :param need_proxy: whether the request needs an http/https proxy
    :return: response text; when an exception is raised, return ''
    """
    crawler.info('the crawling url is {url}'.format(url=url))
    count = 0

    while count < MAX_RETRIES:
        if auth_level == 2:
            name_cookies = Cookies.fetch_cookies()
            if name_cookies is None:
                crawler.warning('No cookie in cookies pool. Maybe all accounts are banned, or all cookies are expired')
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

            # There is no difference between http and https address.
            proxy = {
                'http': name_cookies[2],
                'https': name_cookies[2],
            }
        else:
            proxy = getip.getIPWithoutLogin('')
            # if proxy['http'] is None:
            #     crawler.warning('No available ip in ip pools. Using local ip instead.')

        try:
            if auth_level == 2:
                resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=TIME_OUT,
                                    verify=False, proxies=proxy)
            elif auth_level == 1:
                resp = requests.get(url, headers=headers, cookies=COOKIES, timeout=TIME_OUT,
                                    verify=False, proxies=proxy)
            else:
                resp = requests.get(url, headers=headers, timeout=TIME_OUT, verify=False, proxies=proxy)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('Exceptions are raised when crawling {}. Here are details: {}'.format(url, e))
            count += 1
            time.sleep(EXCP_INTERAL)
            continue

        if resp.status_code == 414:
            crawler.warning('This ip has been blocked by weibo system')
            if not need_proxy:
                send_email()
                os.kill(os.getppid(), signal.SIGTERM)

        if resp.text:
            page = resp.text.encode('utf-8', 'ignore').decode('utf-8')
        else:
            count += 1
            continue

        if auth_level == 2:
            # slow down to avoid being banned
            time.sleep(INTERAL)

            if is_banned(resp.url) or is_403(page):
                crawler.warning('Account {} has been banned'.format(name_cookies[0]))
                LoginInfoOper.freeze_account(name_cookies[0], 0)
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue

        if not is_ajax and not is_complete(page):
            count += 1
            continue

        if is_404(page):
            crawler.warning('{} seems to be 404'.format(url))
            return ''

        Urls.store_crawl_url(url, 1)
        return page

    Urls.store_crawl_url(url, 0)
    return ''
def execute_hot_list_task():
    crawler.info("send task hot_list")
    hot_list_title = ["total", "science", "digital", "sport", "fashion", "film"]
    for title in hot_list_title:
        app.send_task("tasks.hot_list.crawl_hot_list", args=(title,))
def crawl_hot_list(title):
    if not title:
        return None
    crawler.info(f"received task crawl_hot_list {title}")
    get_hot_list(title)
def save_market(b_market_field):
    crawler.info('The save market task is starting...')
    BDxtMarketHouse.add(b_market_field)