def main_task():
    # Crawl the Ming Pao PNS archive day by day and hand each day's article URLs to SpiderMingPao.
    issue_dict_url = 'http://news.mingpao.com/dat/pns/issuelist.js?819181'
    r = requests.get(issue_dict_url)
    json_issue_dict = json.loads(r.text)
    src.spiders.spider_mingpao.SpiderMingPao.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20040627'), util.get_offset_by_day_date('20010101')):
        day_str = util.get_day_string(offset=i)
        if '1 ' + day_str in json_issue_dict['PNS_WEB_TC']:
            issue_id = json_issue_dict['PNS_WEB_TC']['1 ' + day_str]['E']
            news_list_url = 'http://news.mingpao.com/dat/pns/pns_web_tc/feed1/' + day_str + issue_id + '/content.js'
            mingpao_seed = set()
            r = requests.get(news_list_url)
            if re.findall(r'feed_module_2', r.text):
                news_list_data = news_list_data_pattern.findall(r.text)[0]
                json_obj = json.loads(news_list_data)
                for it in json_obj['rss']['channel']['item']:
                    mingpao_seed.add(
                        'http://news.mingpao.com/dat/pns/pns_web_tc/article1/' + day_str + issue_id.lower() +
                        '/todaycontent_' + str(it['ATTRIBUTES']['NODEID']) + '.js')
            mingpao_reg = {ur'http://news\.mingpao\.com/dat/pns/.*' + day_str + '.+'}
            spider_mingpao = src.spiders.spider_mingpao.SpiderMingPao(
                'SpiderMingPao', mingpao_seed, mingpao_reg, THREAD_NUM=5)
            spider_mingpao.OFFSET = i
            spider_mingpao.logger_file = spider_mingpao.get_file_logger(
                'mingpao_task_log', 'logs/mingpao_task.log')
            spider_mingpao.BATCH_NUMBER = util.get_day_stamp(i) + 10570
            spider_mingpao.start()
        else:
            print 'KEY ERROR: ' + '"1 ' + day_str + '"'
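# --- Sketch: util date helpers ------------------------------------------------------------
# The tasks above and below all lean on util.get_day_string, util.get_offset_by_day_date and
# util.get_day_stamp, which are defined elsewhere in the repo. The sketch below is only an
# inferred approximation, assuming "offset" counts days before today; the real signatures and
# return values may differ.
import datetime


def get_day_string(offset=0, interval_str='', style=''):
    # Hypothetical: the day `offset` days before today, e.g. '20040627',
    # '2004-06-27' (interval_str='-'), or '06-27-2004' (style='american').
    day = datetime.date.today() - datetime.timedelta(days=offset)
    if style == 'american':
        parts = (day.strftime('%m'), day.strftime('%d'), day.strftime('%Y'))
    else:
        parts = (day.strftime('%Y'), day.strftime('%m'), day.strftime('%d'))
    return interval_str.join(parts)


def get_offset_by_day_date(day_date):
    # Hypothetical: how many days ago `day_date` (YYYYMMDD) was, so that
    # range(get_offset_by_day_date(later), get_offset_by_day_date(earlier))
    # walks backwards in time one day per iteration.
    target = datetime.datetime.strptime(day_date, '%Y%m%d').date()
    return (datetime.date.today() - target).days


def get_day_stamp(offset=0):
    # Hypothetical: a numeric per-day stamp (here, days since the Unix epoch)
    # used as the base of each spider's BATCH_NUMBER.
    day = datetime.date.today() - datetime.timedelta(days=offset)
    return (day - datetime.date(1970, 1, 1)).days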
def main_task():
    govinfo.SpiderGovInfoNews.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20160929'), util.get_offset_by_day_date('19980331')):
        day_str = util.get_day_string(offset=i)
        day_str = day_str[:-2] + '/' + day_str[-2:]
        govinfo_seed = {'http://www.info.gov.hk/gia/general/' + day_str + 'c.htm'}
        govinfo_reg = {ur'http://www\.info\.gov\.hk/gia/general/' + day_str + '.+'}
        spider_govinfo = govinfo.SpiderGovInfoNews('SpiderGovInfoNews', govinfo_seed, govinfo_reg, THREAD_NUM=10)
        spider_govinfo.OFFSET = i
        spider_govinfo.logger_file = spider_govinfo.get_file_logger('govinfo_task_log', 'logs/govinfo_task.log')
        spider_govinfo.BATCH_NUMBER = util.get_day_stamp(i) + 10600
        spider_govinfo.start()
def main_task():
    now_news.SpiderNow.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20151111'), util.get_offset_by_day_date('20110430')):
        day_str = util.get_day_string(interval_str='-', offset=i)
        now_seed = {'https://news.now.com/home/past?date=' + day_str}
        now_reg = {ur'https://news\.now\.com/.+newsId=\d+.+'}
        spider_now = now_news.SpiderNow('SpiderNow', now_seed, now_reg, THREAD_NUM=10)
        spider_now.BATCH_NUMBER = util.get_day_stamp(offset=i) + 10280
        spider_now.OFFSET = i
        spider_now.logger_file = spider_now.get_file_logger('nownews_task_log', 'logs/now_task.log')
        spider_now.start()
def main_task():
    commercialradio.SpiderCommercialRadio.PUT_IN_STORAGE = True
    commercialradio.SpiderCommercialRadio.CRAWL_NEXT = False
    commercial_reg = {ur'http://www\.881903\.com/.+detail.*'}
    reg_pattern = re.compile(ur'http://www\.881903\.com/.+detail.*')
    for i in range(util.get_offset_by_day_date('20110605'), util.get_offset_by_day_date('20080101')):
        day_str = util.get_day_string(interval_str='-', style='american', offset=i)
        portal_url = 'http://www.881903.com/Page/ZH-TW/newssearch.aspx?sdate=' + day_str + '&edate=' + day_str + '&csid=261_0'
        commercial_seed = set()
        # Create the driver outside the try block so the finally clause never
        # touches a driver that failed to start.
        ie_driver = webdriver.Ie('C://Users/benwu/Desktop/IEDriverServer.exe')
        try:
            ie_driver.get(portal_url)
            d = pq(ie_driver.page_source)
            add_hrefs(commercial_seed, reg_pattern, ie_driver.page_source)
            if total_page_pattern.findall(d('td.Font_Article_CH').text()):
                total_page = int(total_page_pattern.findall(d('td.Font_Article_CH').text())[0])
                for j in range(2, total_page + 1):
                    # print 'page: ' + str(j)
                    ie_driver.execute_script('StockSearchCallBack(' + str(j) + ');')
                    # Poll the pager until the requested page has finished rendering.
                    load_done = False
                    while not load_done:
                        dd = pq(ie_driver.page_source)
                        if dd('.Font_Article_CH span'):
                            num = page_num_pattern.findall(dd('.Font_Article_CH span').text())[0]
                            if num == str(j):
                                load_done = True
                    add_hrefs(commercial_seed, reg_pattern, ie_driver.page_source)
                    # print len(commercial_seed)
        finally:
            ie_driver.close()
        spider_commercial = commercialradio.SpiderCommercialRadio(
            'SpiderCommercialRadio', commercial_seed, commercial_reg, THREAD_NUM=10)
        spider_commercial.BATCH_NUMBER = util.get_day_stamp(offset=i) + 10260
        spider_commercial.OFFSET = i
        spider_commercial.logger_file = spider_commercial.get_file_logger(
            'commercial_task_log', 'logs/commercial_task.log')
        spider_commercial.start()
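# --- Sketch: Commercial Radio page helpers ------------------------------------------------
# add_hrefs, total_page_pattern and page_num_pattern are module-level helpers of this task
# that are not shown here. A plausible sketch, assuming add_hrefs simply harvests matching
# <a href> values from the rendered page; the pager regexes are guesses at "共 N 頁"-style
# text, not the actual patterns used in the repo.
import re

from pyquery import PyQuery as pq

total_page_pattern = re.compile(ur'\u5171\s*(\d+)\s*\u9801')  # hypothetical: matches "共 12 頁"
page_num_pattern = re.compile(ur'(\d+)')                      # hypothetical: current page number


def add_hrefs(seed_set, reg_pattern, page_source):
    # Add every link in the page whose href matches reg_pattern to the seed set.
    d = pq(page_source)
    for a in d('a').items():
        href = a.attr('href')
        if href and reg_pattern.match(href):
            seed_set.add(href)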
def main_task():
    rthk.SpiderRTHK.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20160717'), util.get_offset_by_day_date('20150927')):
        current_day_string = util.get_day_string(offset=i)
        day_string = 'archive_year=' + current_day_string[0:4] + '&archive_month=' + \
                     current_day_string[4:6] + '&archive_day=' + current_day_string[6:8]
        instant_news_page_url = 'http://news.rthk.hk/rthk/ch/news-archive.htm?' + day_string + '&archive_cat=all'
        rthk_seed = {instant_news_page_url}
        rthk_reg = {ur'http://news\.rthk\.hk/rthk/ch/component/.*' + current_day_string + '.*'}
        spider_rthk = rthk.SpiderRTHK('SpiderRTHK', rthk_seed, rthk_reg, THREAD_NUM=5)
        spider_rthk.BATCH_NUMBER = util.get_day_stamp() + 10130
        spider_rthk.OFFSET = i
        spider_rthk.logger_file = spider_rthk.get_file_logger('rthk_task_log', 'logs/rthk_task.log')
        spider_rthk.start()
def main_task():
    hket.SpiderHKET.PUT_IN_STORAGE = True
    hket_reg = {ur'http://.+\.hket\.com/article/\d+/.*'}
    for i in range(util.get_offset_by_day_date('20161010'), util.get_offset_by_day_date('20161006')):
        day_str = util.get_day_string(offset=i)
        portal_url = 'http://paper.hket.com/srap017/%E6%98%94%E6%97%A5%E6%96%B0%E8%81%9E?dis=' + day_str
        hket_seed = {portal_url}
        spider_hket = hket.SpiderHKET('SpiderHKET', hket_seed, hket_reg, THREAD_NUM=5, MAX_DEPTH=1)
        spider_hket.BATCH_NUMBER = util.get_day_stamp() + 10110
        spider_hket.OFFSET = i
        spider_hket.logger_file = spider_hket.get_file_logger('hket_task_log', 'logs/hket_task.log')
        spider_hket.start()
def main_task():
    src.spiders.spider_apple.SpiderApple.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20070227'), util.get_offset_by_day_date('20020101')):
        day_str = util.get_day_string(offset=i)
        apple_seed = {'http://hk.apple.nextmedia.com/archive/index/' + day_str + '/index/'}
        spider_apple = src.spiders.spider_apple.SpiderApple(
            'SpiderApple', apple_seed,
            {ur'http://hk\.apple\.nextmedia\.com/.*' + day_str + '/.*'},
            THREAD_NUM=5)
        spider_apple.BATCH_NUMBER = util.get_day_stamp(offset=i) + 10590
        spider_apple.OFFSET = i
        spider_apple.logger_file = spider_apple.get_file_logger(
            'apple_task_log', 'logs/apple_task.log')
        spider_apple.start()
def main_task():
    cable_news.SpiderCableNews.PUT_IN_STORAGE = True
    for i in range(util.get_offset_by_day_date('20160724'), util.get_offset_by_day_date('20131231')):
        day_str = util.get_day_string(offset=i)
        first_url = get_news_page_url(day_str, 1)
        r = requests.get(first_url)
        d = pq(r.text)
        if total_page_pattern.findall(d('#1').text()):
            total_page = int(total_page_pattern.findall(d('#1').text())[0])
            cablenews_seed = set()
            for j in range(total_page):
                cablenews_seed.add(get_news_page_url(day_str, j + 1))
            cablenews_reg = {ur'http://.+?\.i-cable\.com/.*videopage.*\d+/.*',
                             ur'http://.+?\.i-cable\.com/.*VideoPage.*\d+/.*'}
            spider_cablenews = cable_news.SpiderCableNews('SpiderCableNews', cablenews_seed, cablenews_reg,
                                                          THREAD_NUM=10)
            spider_cablenews.BATCH_NUMBER = util.get_day_stamp(offset=i) + 10220
            spider_cablenews.OFFSET = i
            spider_cablenews.logger_file = spider_cablenews.get_file_logger('cablenews_task_log',
                                                                            'logs/cablenews_task.log')
            spider_cablenews.start()
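# --- Sketch: i-CABLE listing helpers ------------------------------------------------------
# get_news_page_url and total_page_pattern are defined elsewhere in this task module. A rough
# sketch of their likely shape; the query-string layout and regex are assumptions, not the
# actual i-CABLE archive URL scheme.
import re

total_page_pattern = re.compile(r'(\d+)')  # hypothetical: page count scraped from the pager text


def get_news_page_url(day_str, page):
    # Hypothetical archive-listing URL for one day of i-CABLE news, one result page at a time.
    return ('http://cablenews.i-cable.com/webapps/news_archive/index.php'
            '?date=' + day_str + '&page=' + str(page))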
def main_task():
    src.spiders.spider_hkej.SpiderHKEJ.PUT_IN_STORAGE = True
    src.spiders.spider_hkej.SpiderHKEJ.ADD_MEDIA = True
    for i in range(util.get_offset_by_day_date('20161025'), util.get_offset_by_day_date('20160901')):
        src.spiders.spider_hkej.SpiderHKEJ.start_crawling(offset=i, THREAD_NUM=20)