def get_auto_configured_spider(cls, offset=0):
    cats = {
        '105': '海外綜合',
        '102': '新聞專輯',
    }
    singtao_seed = {'http://std.stheadline.com/'}
    util.add_hrefs('http://std.stheadline.com/',
                   selectors={'#navbar ul.nav li.has-children > a'},
                   seeds=singtao_seed)
    # Legacy category discovery, kept for reference:
    '''
    id_pattern = re.compile(ur'(?<=php\?cat=)\d+')
    for cat in d('ul.sub-menu a').items():
        if re.findall(id_pattern, cat.attr('href')):
            cats[re.findall(id_pattern, cat.attr('href'))[0]] = cat.text()
        if instant_cat_pattern.match(cat.attr('href')):
            singtao_seed.add(cat.attr('href'))
    for k, v in cats.iteritems():
        singtao_seed.add(
            'http://std.stheadline.com/daily/section-list.php?cat=' + k)
    '''
    spider_singtao = SpiderSingTao(
        'SpiderSingTao',
        singtao_seed,
        {
            ur'(http://std\.stheadline\.com/daily/news-content\.php.*)|(http://std\.stheadline\.com/instant/articles/detail/\d+.*)'
        },
        THREAD_NUM=1,
        cats=cats)
    spider_singtao.FETCH_DELAY = 0.5
    spider_singtao.BATCH_NUMBER = util.get_day_stamp() + 10060
    spider_singtao.OFFSET = offset
    return spider_singtao
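# For reference: every factory below leans on util.add_hrefs. The quoted
# block sketches only the contract implied by the call sites in this module
# (positional prefix arguments at several call sites suggest prefix precedes
# seed_patterns); it is an assumption, not util's actual code.
'''
def add_hrefs(url, selectors, seeds, prefix=None, seed_patterns=None):
    # Fetch url, collect hrefs under each CSS selector, optionally prepend
    # prefix to scheme-less links, keep only links matching one of
    # seed_patterns (compiled regexes), and add the survivors to seeds.
    r = util.get_safe_response(url)
    if not r:
        return
    d = pq(r.text)
    for selector in selectors:
        for a in d(selector).items():
            href = a.attr('href')
            if not href:
                continue
            if prefix and not href.startswith('http'):
                href = prefix + href
            if seed_patterns and not any(p.match(href) for p in seed_patterns):
                continue
            seeds.add(href)
'''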
def get_auto_configured_spider(cls, offset=0):
    hkcd_seed = {'http://hk.hkcd.com/node_25195.htm'}
    util.add_hrefs(url='http://hk.hkcd.com/node_30602.htm',
                   selectors={'a'},
                   seeds=hkcd_seed,
                   seed_patterns={node_pattern},
                   prefix='http://hk.hkcd.com/')
    hkcd_seed.add('http://www.hkcd.com/')
    hk_cat_dict = _get_hk_cat_dict(hkcd_seed, {content_pattern})
    cat_dict = _get_cat_dict(
        {'http://www.hkcd.com/content/2016-07/18/content_1008717.html'})
    # Rewrite 'YYYYMMDD' into the 'YYYY-MM/DD' fragment used in article URLs.
    current_day_string = util.get_day_string(offset=offset)
    day_string = (current_day_string[0:4] + '-' + current_day_string[4:6] +
                  '/' + current_day_string[6:8])
    hkcd_reg = {r'http://(www|hk)\.hkcd\.com/content/' + day_string + '/.*'}
    spider_hkcd = SpiderHKCD('SpiderHKCD',
                             hkcd_seed,
                             hkcd_reg,
                             THREAD_NUM=10,
                             MAX_DEPTH=2)
    spider_hkcd._hk_cat_dict = hk_cat_dict
    spider_hkcd._cat_dict = cat_dict
    spider_hkcd.BATCH_NUMBER = util.get_day_stamp() + 10120
    return spider_hkcd
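# For reference, the date arithmetic above and in the factories below assumes
# the following util behaviour, inferred from how the results are sliced (an
# assumption, not the actual implementation):
#
#   util.get_day_string(offset=0)                   -> '20160718'
#   util.get_day_string('-', offset=0)              -> '2016-07-18'
#   util.get_day_string(interval_str='/', offset=0) -> '2016/07/18'
#   util.get_day_string('.', 'inverse', offset=0)   -> '18.07.2016'
#   util.get_day_stamp()                            -> numeric stamp for today
#
# Under that assumption, '20160718' becomes the '2016-07/18' fragment that
# appears in hkcd_reg article URLs above.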
def get_auto_configured_spider(cls, offset=0):
    dmhk_seed = {'http://news.dmhk.net/'}
    util.add_hrefs('http://news.dmhk.net/', {'#mega_main_menu_ul a'},
                   dmhk_seed)
    # Escape the query '?' so the pattern matches http://news.dmhk.net/?p=123.
    dmhk_reg = {ur'http://news\.dmhk\.net/\?p=\d+'}
    spider_dmhk = SpiderDMHK('SpiderDMHK', dmhk_seed, dmhk_reg, THREAD_NUM=5)
    spider_dmhk.BATCH_NUMBER = util.get_day_stamp() + 10230
    return spider_dmhk
def get_auto_configured_spider(cls, offset=0):
    _852_seed = {'http://www.post852.com/'}
    util.add_hrefs('http://www.post852.com/', {'#rightnav a'}, _852_seed)
    spider_852 = Spider852('Spider852',
                           _852_seed,
                           {ur'http://www\.post852\.com/\d+/.+'},
                           THREAD_NUM=5)
    spider_852.OFFSET = offset
    spider_852.BATCH_NUMBER = util.get_day_stamp() + 10400
    return spider_852
def get_auto_configured_spider(cls, offset=0):
    savantas_seed = {'http://www.savantas.org/'}
    util.add_hrefs('http://www.savantas.org/', {'#navigation a'},
                   savantas_seed)
    spider_savantas = SpiderSavantas('SpiderSavantas',
                                     savantas_seed,
                                     {ur'http://www\.savantas\.org/\?p=\d+'},
                                     THREAD_NUM=5)
    spider_savantas.OFFSET = offset
    spider_savantas.BATCH_NUMBER = util.get_day_stamp() + 10460
    return spider_savantas
def get_auto_configured_spider(cls, offset=0):
    hkcna_seed = {'http://www.hkcna.hk/'}
    util.add_hrefs('http://www.hkcna.hk/', {'.baner_01 a'}, hkcna_seed,
                   prefix=news_prefix)
    # Article URLs embed the date as 'YYYY/MMDD'.
    day_str = util.get_day_string(offset=offset)
    day_str = day_str[:4] + '/' + day_str[4:]
    spider_hkcna = SpiderHKCNA('SpiderHKCNA',
                               hkcna_seed,
                               {ur'http://www\.hkcna\.hk/.+' + day_str + '.+'},
                               THREAD_NUM=5)
    spider_hkcna.OFFSET = offset
    spider_hkcna.BATCH_NUMBER = util.get_day_stamp() + 10370
    return spider_hkcna
def get_auto_configured_spider(cls, offset=0):
    now_seed = {'https://news.now.com/home'}
    util.add_hrefs('https://news.now.com/home', {'#navBar a'},
                   now_seed,
                   seed_patterns={re.compile(r'/home/.+')},
                   prefix=prefix)
    spider_now = SpiderNow('SpiderNow',
                           now_seed,
                           {ur'https://news\.now\.com/.+newsId=\d+.+'},
                           THREAD_NUM=10)
    spider_now.BATCH_NUMBER = util.get_day_stamp() + 10280
    spider_now.OFFSET = offset
    return spider_now
def get_auto_configured_spider(cls, offset=0):
    finet_seed = {'http://www2.finet.hk/'}
    util.add_hrefs(url='http://www2.finet.hk/',
                   selectors={'#mainmenu2 li a'},
                   seeds=finet_seed)
    finet_reg = {ur'http://www2\.finet\.hk/Newscenter/news_detail/.+'}
    spider_finet = SpiderFinet('SpiderFinet',
                               finet_seed,
                               finet_reg,
                               THREAD_NUM=10)
    spider_finet.BATCH_NUMBER = util.get_day_stamp() + 10250
    spider_finet.OFFSET = offset
    return spider_finet
def get_auto_configured_spider(cls, offset=0):
    cablenews_seed = {'http://cablenews.i-cable.com/webapps/index/index.php'}
    util.add_hrefs('http://cablenews.i-cable.com/webapps/index/index.php',
                   {'#header_web_chi a'}, cablenews_seed)
    cablenews_reg = {ur'http://.+?\.i-cable\.com/.*videopage.*\d+/.*'}
    spider_cablenews = SpiderCableNews('SpiderCableNews',
                                       cablenews_seed,
                                       cablenews_reg,
                                       THREAD_NUM=10)
    spider_cablenews.BATCH_NUMBER = util.get_day_stamp() + 10220
    spider_cablenews.OFFSET = offset
    return spider_cablenews
def get_auto_configured_spider(cls, offset=0):
    bbc_seed = set()
    util.add_hrefs('http://www.bbc.com/zhongwen/trad/',
                   {'ul.navigation-wide-list a'}, bbc_seed,
                   prefix=news_prefix)
    bbc_seed.add('http://www.bbc.com/zhongwen/trad/hong_kong_review')
    # Article URLs carry the date as 'YYMMDD'.
    day_str = util.get_day_string(offset=offset)
    day_str = day_str[2:]
    spider_bbc = SpiderBBC('SpiderBBC',
                           bbc_seed,
                           {ur'http://www\.bbc\.com/.+' + day_str + '.+'},
                           THREAD_NUM=5)
    spider_bbc.OFFSET = offset
    spider_bbc.BATCH_NUMBER = util.get_day_stamp() + 10360
    return spider_bbc
def get_auto_configured_spider(cls, offset=0):
    tvb_seed = {'http://news.tvb.com/'}
    util.add_hrefs(
        url='http://news.tvb.com/',
        selectors={'#topMenu a'},
        seeds=tvb_seed,
        seed_patterns={re.compile(r'http://news\.tvb\.com/list/\d+/')})
    spider_tvb = SpiderTVB('SpiderTVB',
                           tvb_seed,
                           {ur'http://news\.tvb\.com/\w+/[\d\w]{10,}'},
                           THREAD_NUM=10,
                           MAX_DEPTH=2)
    spider_tvb.BATCH_NUMBER = util.get_day_stamp() + 10290
    spider_tvb.OFFSET = offset
    return spider_tvb
def get_auto_configured_spider(cls, offset=0):
    vj_seed = {'http://www.vjmedia.com.hk/'}
    util.add_hrefs(url='http://www.vjmedia.com.hk/',
                   selectors={'ul.mainnav.dropdown li a'},
                   seeds=vj_seed)
    vj_reg = {
        ur'http://www\.vjmedia\.com\.hk/articles/' +
        util.get_day_string(interval_str='/', offset=offset) + '/.+'
    }
    spider_vj = SpiderVJMedia('SpiderVJMedia', vj_seed, vj_reg, THREAD_NUM=10)
    spider_vj.OFFSET = offset
    spider_vj.BATCH_NUMBER = util.get_day_stamp() + 10180
    return spider_vj
def get_auto_configured_spider(cls, offset=0):
    _wsj_seed = {'http://cn.wsj.com/gb/globe.asp'}
    util.add_hrefs('http://cn.wsj.com/gb/globe.asp',
                   {'#navigation a[target=_top]'}, _wsj_seed)
    # Complete scheme-less ('//host/...') category links.
    wsj_seed = set()
    for url in _wsj_seed:
        if incomplete_cat_url_pattern.match(url):
            url = news_prefix + url[2:]
        wsj_seed.add(url)
    day_str = util.get_day_string(offset=offset)
    spider_wsj = SpiderWSJ('SpiderWSJ',
                           wsj_seed,
                           {ur'http://cn\.wsj\.com/gb.+' + day_str + '.+'},
                           THREAD_NUM=5)
    spider_wsj.OFFSET = offset
    spider_wsj.BATCH_NUMBER = util.get_day_stamp() + 10380
    return spider_wsj
def get_auto_configured_spider(cls, offset=0):
    metrofinance_seed = {
        'http://www.metroradio.com.hk/104/',
        'http://www.metroradio.com.hk/news/live.aspx'
    }
    util.add_hrefs('http://www.metroradio.com.hk/104/', {'.toplink2 a'},
                   metrofinance_seed,
                   prefix=prefix)
    day_str = util.get_day_string(offset=offset)
    spider_metrofinance = SpiderMetroFinance(
        'SpiderMetroFinance',
        metrofinance_seed,
        {ur'http://www\.metroradio\.com\.hk/.+' + day_str + '.+'},
        THREAD_NUM=10)
    spider_metrofinance.OFFSET = offset
    spider_metrofinance.BATCH_NUMBER = util.get_day_stamp() + 10410
    return spider_metrofinance
def get_auto_configured_spider(cls, offset=0):
    xinhua_seed = {
        'http://www.news.cn/gangao/index.htm',
        'http://www.news.cn/gangao/jsxw.htm'
    }
    util.add_hrefs('http://www.news.cn/gangao/index.htm', {'.nav.domPC a'},
                   xinhua_seed,
                   prefix=news_prefix)
    # Turn 'YYYY-MM-DD' into the 'YYYY-MM/DD' fragment used by xinhuanet.
    day_str = util.get_day_string('-', offset=offset)
    day_str = day_str[:-3] + '/' + day_str[-2:]
    spider_xinhua = SpiderXinhua(
        'SpiderXinhua',
        xinhua_seed,
        {ur'http://news\.xinhuanet\.com/gangao/' + day_str + '.+'},
        THREAD_NUM=5)
    spider_xinhua.OFFSET = offset
    spider_xinhua.BATCH_NUMBER = util.get_day_stamp() + 10430
    return spider_xinhua
def get_auto_configured_spider(cls, offset=0):
    initium_seed = {'https://theinitium.com/'}
    util.add_hrefs(url='https://theinitium.com/',
                   selectors={'div.left-nav-top li a'},
                   seeds=initium_seed,
                   prefix=prefix)
    initium_reg = {
        ur'https://theinitium\.com/article/' +
        util.get_day_string(offset=offset) + '-.+',
        ur'http://feeds\.initium.+'
    }
    spider_initium = SpiderInitium('SpiderInitium',
                                   initium_seed,
                                   initium_reg,
                                   THREAD_NUM=10)
    spider_initium.BATCH_NUMBER = util.get_day_stamp() + 10190
    spider_initium.OFFSET = offset
    return spider_initium
def get_auto_configured_spider(cls, offset=1):
    day_str = util.get_day_string('.', 'inverse', offset=offset)
    commercial_seed = {
        'http://www.881903.com/Page/ZH-TW/news.aspx?sdate=' + day_str +
        '&csid=261_341'
    }
    util.add_hrefs(url='http://www.881903.com/Page/ZH-TW/news.aspx?sdate=' +
                   day_str + '&csid=261_341',
                   seeds=commercial_seed,
                   selectors={'#newsCategoryTab a'},
                   prefix=cat_prefix)
    # Expand every category page into its full pagination.
    _seed = copy.deepcopy(commercial_seed)
    for seed in _seed:
        if cat_page_pattern.match(seed):
            r = util.get_safe_response(seed)
            if r:
                d = pq(r.text)
                if re.findall(total_page_pattern,
                              d('.Font_Article_CH').text()):
                    total_page = int(
                        re.findall(total_page_pattern,
                                   d('.Font_Article_CH').text())[0])
                    for i in range(total_page):
                        commercial_seed.add(seed + '&page=' + str(i + 1))
    # Legacy seed discovery, kept for reference:
    '''
    r = requests.get('http://www.881903.com/Page/ZH-TW/index.aspx')
    d = pq(r.text)
    for a in d('.header2012 ul li a').items():
        if a.attr('href'):
            u = a.attr('href')
            if not complete_pattern.match(u):
                if incomplete_pattern.match(u):
                    u = prefix + u
                commercial_seed.add(u)
    '''
    commercial_reg = {ur'http://www\.881903\.com/.+detail.*'}
    spider_commercial = SpiderCommercialRadio('SpiderCommercialRadio',
                                              commercial_seed,
                                              commercial_reg,
                                              THREAD_NUM=10)
    spider_commercial.BATCH_NUMBER = util.get_day_stamp() + 10260
    spider_commercial.OFFSET = offset
    # spider_commercial.MAX_DEPTH = 5
    return spider_commercial
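# A minimal driver sketch: assuming each get_auto_configured_spider is a
# classmethod on its spider class and the base class exposes a run entry
# point (the crawl() name here is hypothetical), a scheduler might do:
'''
spider = SpiderCommercialRadio.get_auto_configured_spider(offset=1)
print spider.BATCH_NUMBER  # day stamp + 10260
spider.crawl()             # hypothetical entry point on the spider base class
'''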