def get_auto_configured_spider(cls, offset=0):
    day_str = util.get_day_string(offset=offset)
    hket_seed = {'http://inews.hket.com/sran001/%E5%85%A8%E9%83%A8?dis=' + day_str}
    spider_hket = SpiderHKET('SpiderHKET', hket_seed,
                             {ur'http://.+\.hket\.com/article/\d+/.*'},
                             THREAD_NUM=5)
    spider_hket.BATCH_NUMBER = util.get_day_stamp(offset=offset) + 10110
    spider_hket.OFFSET = offset
    return spider_hket
def get_auto_configured_spider(cls, offset=0):
    dmhk_seed = {'http://news.dmhk.net/'}
    util.add_hrefs('http://news.dmhk.net/', {'#mega_main_menu_ul a'}, dmhk_seed)
    # Escape the '?' so it matches WordPress-style '?p=<id>' article URLs
    # literally instead of acting as a regex quantifier on the slash.
    dmhk_reg = {ur'http://news\.dmhk\.net/\?p=\d+'}
    spider_dmhk = SpiderDMHK('SpiderDMHK', dmhk_seed, dmhk_reg, THREAD_NUM=5)
    spider_dmhk.BATCH_NUMBER = util.get_day_stamp() + 10230
    return spider_dmhk
def get_auto_configured_spider(cls, offset=0):
    hkcd_seed = {'http://hk.hkcd.com/node_25195.htm'}
    util.add_hrefs(url='http://hk.hkcd.com/node_30602.htm',
                   selectors={'a'},
                   seeds=hkcd_seed,
                   seed_patterns={node_pattern},
                   prefix='http://hk.hkcd.com/')
    hkcd_seed.add('http://www.hkcd.com/')
    hk_cat_dict = _get_hk_cat_dict(hkcd_seed, {content_pattern})
    cat_dict = _get_cat_dict(
        {'http://www.hkcd.com/content/2016-07/18/content_1008717.html'})
    current_day_string = util.get_day_string(offset=offset)
    # Reformat yyyymmdd as 'yyyy-mm/dd' to match HKCD's article URL layout.
    day_string = (current_day_string[0:4] + '-' + current_day_string[4:6] +
                  '/' + current_day_string[6:8])
    hkcd_reg = {r'http://(www|hk)\.hkcd\.com/content/' + day_string + '/.*'}
    spider_hkcd = SpiderHKCD('SpiderHKCD', hkcd_seed, hkcd_reg,
                             THREAD_NUM=10, MAX_DEPTH=2)
    spider_hkcd._hk_cat_dict = hk_cat_dict
    spider_hkcd._cat_dict = cat_dict
    spider_hkcd.BATCH_NUMBER = util.get_day_stamp() + 10120
    return spider_hkcd
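# The HKCD factory above references module-level node_pattern and
# content_pattern regexes that are not shown in this section. A minimal
# sketch of what they plausibly look like, inferred from the URLs handled
# here; the exact patterns in the real module may differ.
import re

# Hypothetical: section index pages such as http://hk.hkcd.com/node_25195.htm
node_pattern = re.compile(ur'http://hk\.hkcd\.com/node_\d+\.htm')
# Hypothetical: article pages such as
# http://www.hkcd.com/content/2016-07/18/content_1008717.html
content_pattern = re.compile(
    ur'http://(www|hk)\.hkcd\.com/content/\d{4}-\d{2}/\d{2}/content_\d+\.html')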
def get_auto_configured_spider(cls, offset=0):
    cats = dict()
    cats['105'] = '海外綜合'  # "overseas roundup"
    cats['102'] = '新聞專輯'  # "news specials"
    singtao_seed = {'http://std.stheadline.com/'}
    util.add_hrefs('http://std.stheadline.com/',
                   selectors={'#navbar ul.nav li.has-children > a'},
                   seeds=singtao_seed)
    '''
    id_pattern = re.compile(ur'(?<=php\?cat=)\d+')
    for cat in d('ul.sub-menu a').items():
        if re.findall(id_pattern, cat.attr('href')):
            cats[re.findall(id_pattern, cat.attr('href'))[0]] = cat.text()
        if instant_cat_pattern.match(cat.attr('href')):
            singtao_seed.add(cat.attr('href'))
    for k, v in cats.iteritems():
        singtao_seed.add('http://std.stheadline.com/daily/section-list.php?cat=' + k)
    '''
    spider_singtao = SpiderSingTao(
        'SpiderSingTao', singtao_seed,
        {ur'(http://std\.stheadline\.com/daily/news-content\.php.*)'
         ur'|(http://std\.stheadline\.com/instant/articles/detail/\d+.*)'},
        THREAD_NUM=1, cats=cats)
    spider_singtao.FETCH_DELAY = 0.5
    spider_singtao.BATCH_NUMBER = util.get_day_stamp() + 10060
    spider_singtao.OFFSET = offset
    return spider_singtao
def get_auto_configured_spider(cls, offset=0):
    gov_seed = set()
    gov_seed.add('http://www.news.gov.hk/tc/index.shtml')
    gov_r = requests.get('http://www.news.gov.hk/tc/index.shtml')
    gov_index = pq(gov_r.text)
    year_str = str(time.localtime().tm_year)
    month_str = '%02d' % time.localtime().tm_mon
    gov_seed.add('http://archive.news.gov.hk/tc/city_life/html/' +
                 year_str + '/' + month_str + '/index.shtml')
    gov_seed.add('http://archive.news.gov.hk/tc/record/html/' +
                 year_str + '/' + month_str + '/index.shtml')
    gov_sub_cats = gov_index('div[id=subnav] a').items()
    for a in gov_sub_cats:
        url_a = a.attr('href')
        # Drop the trailing 'index.shtml' and point at this month's archive.
        url_a = ('http://archive.news.gov.hk' + url_a[:-11] + 'html/' +
                 year_str + '/' + month_str + '/index.shtml')
        gov_seed.add(url_a)
    spider_gov = SpiderGov(
        'SpiderGov', gov_seed,
        {ur'http://.*\.news\.gov\.hk/tc/.*/\d\d\d\d/\d\d/\d+_\d+.*'},
        THREAD_NUM=10, MAX_DEPTH=2)
    spider_gov.AUTO_FILTER_URL_PARAS = True
    spider_gov.OFFSET = offset
    spider_gov.BATCH_NUMBER = util.get_day_stamp() + 10004
    return spider_gov
def get_auto_configured_spider(cls, offset=0, **kwargs):
    day_str = util.get_day_string(offset=offset)
    issue_dict_url = 'http://news.mingpao.com/dat/pns/issuelist.js?819181'
    r = requests.get(issue_dict_url)
    json_obj = json.loads(r.text)
    if '1 ' + day_str in json_obj['PNS_WEB_TC']:
        issue_id = json_obj['PNS_WEB_TC']['1 ' + day_str]['E']
    else:
        # The key is absent here, so report the missing day rather than
        # indexing into the dict (which would raise a KeyError).
        print 'KEY ERROR: no issue found for 1 ' + day_str
        return
    news_list_url = ('http://news.mingpao.com/dat/pns/pns_web_tc/feed1/' +
                     day_str + issue_id + '/content.js')
    mingpao_seed = set()
    r = requests.get(news_list_url)
    if re.findall(r'feed_module_2', r.text):
        news_list_data = news_list_data_pattern.findall(r.text)[0]
        json_obj = json.loads(news_list_data)
        for it in json_obj['rss']['channel']['item']:
            mingpao_seed.add(
                'http://news.mingpao.com/dat/pns/pns_web_tc/article1/' +
                day_str + issue_id.lower() + '/todaycontent_' +
                str(it['ATTRIBUTES']['NODEID']) + '.js')
    mingpao_reg = {ur'http://news\.mingpao\.com/dat/pns/.*' + day_str + '.+'}
    spider_mingpao = SpiderMingPao('SpiderMingPao', mingpao_seed, mingpao_reg,
                                   THREAD_NUM=10)
    spider_mingpao.OFFSET = offset
    spider_mingpao.BATCH_NUMBER = util.get_day_stamp(offset) + 10570
    return spider_mingpao
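# SpiderMingPao's factory above depends on a module-level
# news_list_data_pattern that pulls the JSON payload out of the content.js
# response. A minimal sketch, assuming the feed wraps the payload in a
# 'feed_module_2 = {...};' assignment (the exact wrapper is an assumption):
import re

news_list_data_pattern = re.compile(r'feed_module_2\s*=\s*(\{.*\})\s*;',
                                    re.S)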
def page_filter(self, doc, url):
    wanted = False
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t_stamp = util.get_timestamp_from_selectors(
                doc=doc, selectors=selectors, date_patterns=date_patterns)
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                wanted = True
    return wanted
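# page_filter above reads module-level 'selectors' and 'date_patterns' that
# util.get_timestamp_from_selectors uses to locate and parse the article
# date. A minimal sketch under assumed conventions (CSS selectors plus
# date-extraction regexes); the real module's values may differ.
import re

selectors = {'span.date', 'meta[property="article:published_time"]'}
date_patterns = {re.compile(ur'\d{4}-\d{2}-\d{2}')}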
def task_filter(self, doc, url, doc_url):
    wanted = False
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t = full_article_pattern.findall(url)[0]
            if (util.get_timestamp_from_string(t, '%m-%d-%Y') >=
                    util.get_day_stamp(offset=self.OFFSET)):
                wanted = True
    return wanted
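# task_filter above assumes a module-level full_article_pattern whose first
# match on the URL is an 'mm-dd-yyyy' token (it is parsed with '%m-%d-%Y').
# A plausible sketch, hypothetical in its details:
import re

full_article_pattern = re.compile(r'\d{2}-\d{2}-\d{4}')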
def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t = doc('.updated').text()
            t_stamp = util.get_timestamp_from_string(t)
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False
def get_auto_configured_spider(cls, offset=0):
    _852_seed = {'http://www.post852.com/'}
    util.add_hrefs('http://www.post852.com/', {'#rightnav a'}, _852_seed)
    # Escape the dots so they match literally, as in the other URL patterns.
    spider_852 = Spider852('Spider852', _852_seed,
                           {ur'http://www\.post852\.com/\d+/.+'},
                           THREAD_NUM=5)
    spider_852.OFFSET = offset
    spider_852.BATCH_NUMBER = util.get_day_stamp() + 10400
    return spider_852
def get_auto_configured_spider(cls, offset=0):
    y28_seed = {'http://www.y28club.com/y28news/cgi-bin/news/newshome.pl'}
    spider_y28 = SpiderY28(
        'SpiderY28', y28_seed,
        {ur'http://www\.y28club\.com/y28news/cgi-bin/news/.+'},
        THREAD_NUM=10)
    spider_y28.BATCH_NUMBER = util.get_day_stamp() + 10530
    spider_y28.OFFSET = offset
    return spider_y28
def get_auto_configured_spider(cls, offset=0):
    reuters_seed = {'http://cn.reuters.com/markets/hongkong'}
    spider_reuters = SpiderReuters(
        'SpiderReuters', reuters_seed,
        {ur'http://cn\.reuters\.com/article/.+'},
        THREAD_NUM=10)
    spider_reuters.BATCH_NUMBER = util.get_day_stamp() + 10560
    spider_reuters.OFFSET = offset
    return spider_reuters
def get_auto_configured_spider(cls, offset=0):
    edb_seed = {'http://www.edb.gov.hk/tc/news/all.html'}
    spider_edb = SpiderEDB('SpiderEDB', edb_seed,
                           {ur'http://www\.edb\.gov\.hk/.+\.htm.*'},
                           THREAD_NUM=5)
    spider_edb.OFFSET = offset
    spider_edb.BATCH_NUMBER = util.get_day_stamp() + 10470
    return spider_edb
def get_auto_configured_spider(cls, offset=0):
    newcent_seed = {'http://www.ncforum.org.hk/news'}
    spider_newcent = SpiderNewCenturyForum(
        'SpiderNewCenturyForum', newcent_seed,
        {ur'http://www\.ncforum\.org\.hk/news/.+'},
        THREAD_NUM=5)
    spider_newcent.OFFSET = offset
    spider_newcent.BATCH_NUMBER = util.get_day_stamp() + 10450
    return spider_newcent
def get_auto_configured_spider(cls, offset=0):
    unwire_seed = {'https://unwire.hk/articles/page/1/'}
    spider_unwire = SpiderUnwire('SpiderUnwire', unwire_seed,
                                 {ur'https://unwire\.hk/\d{4}/\d\d/\d\d/.+'},
                                 THREAD_NUM=10, MAX_DEPTH=2)
    spider_unwire.BATCH_NUMBER = util.get_day_stamp() + 10680
    spider_unwire.OFFSET = offset
    return spider_unwire
def get_auto_configured_spider(cls, offset=0):
    jd_seed = {'http://www.jdonline.com.hk/index/index.php'}
    spider_jd = SpiderJD(
        'SpiderJD', jd_seed,
        {ur'http://www\.jdonline\.com\.hk.+\?news_id=\d+.+'},
        THREAD_NUM=10)
    spider_jd.BATCH_NUMBER = util.get_day_stamp() + 10540
    spider_jd.OFFSET = offset
    return spider_jd
def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t = doc('div[itemprop="dateCreated"]').attr('datetime')
            # Shift the parsed time by +8 hours (UTC to Hong Kong time).
            t_stamp = util.get_timestamp_from_string(t) + 8 * 3600
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False
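# Quick illustration of the +8 * 3600 shift above, assuming
# util.get_timestamp_from_string returns epoch seconds:
#   2016-07-18T00:30:00 (UTC)  -> 1468801800
#   + 8 * 3600                 -> 1468830600, i.e. 2016-07-18 08:30 HKT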
def get_auto_configured_spider(cls, offset=0):
    speakout_seed = {'http://speakout.hk/'}
    spider_speakout = SpiderSpeakout('SpiderSpeakout', speakout_seed,
                                     {ur'http://.*\d\d\d\d-\d\d-\d\d.+'},
                                     THREAD_NUM=10)
    spider_speakout.BATCH_NUMBER = util.get_day_stamp() + 10520
    spider_speakout.OFFSET = offset
    return spider_speakout
def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t = util.get_time_string_from_selectors(doc, {'span.time'})
            t_stamp = util.get_timestamp_from_string(t)
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False
def get_auto_configured_spider(cls, offset=0):
    hkej_seed = {'http://www2.hkej.com/instantnews',
                 'http://www.hkej.com/template/landing11/jsp/main.jsp',
                 'http://www1.hkej.com/dailynews/toc?date=' +
                 util.get_day_string('-', offset=offset)}
    # util.add_hrefs('http://www.hkej.com/template/landing11/jsp/main.jsp',
    #                {'a'}, hkej_seed,
    #                seed_patterns={re.compile(ur'http://www.*hkej\.com/.+')})
    # ** currently the page regexes only cover instant news and daily news
    hkej_reg = {ur'http://www.*?\.hkej\.com/instantnews.*article/.+',
                ur'http://www1\.hkej\.com/.*dailynews/.*article/.+'}
    spider_hkej = SpiderHKEJ('SpiderHKEJ', hkej_seed, hkej_reg,
                             THREAD_NUM=10, MAX_DEPTH=1)
    spider_hkej.BATCH_NUMBER = util.get_day_stamp() + 10150
    spider_hkej.OFFSET = offset
    return spider_hkej
def _time_filter(self, doc, url):
    if doc is self._doc:
        # Same document as the last call: reuse the cached verdict.
        return self._doc_wanted
    t_stamp = util.get_timestamp_from_selectors(
        doc=doc, selectors=selectors, date_patterns=date_patterns)
    if t_stamp >= util.get_day_stamp(self.OFFSET) or t_stamp == 0:
        self._doc_wanted = True
        return True
    self._doc_wanted = False
    return False
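# Note on the cache used by _time_filter above: it assumes the crawler sets
# self._doc to the page currently being processed, so repeated filter calls
# for links found on the same document reuse the cached verdict instead of
# re-running the selectors. A minimal sketch of that caller-side contract
# (attribute names from the filter; the call site itself is hypothetical):
#
#   self._doc = doc                      # prime the cache key
#   for link in extract_links(doc):      # hypothetical helper
#       if self._time_filter(doc, link):
#           enqueue(link)                # hypothetical helper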
def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            meta_txt = doc('.metaStuff').text()
            # Grab the Chinese date from the meta text, e.g. '7月18日 ...'
            # ('月' marks the month).
            t = re.findall(ur'[^\s]+月.+', meta_txt)[0]
            t_stamp = util.get_timestamp_from_string(t)
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False
def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            if doc('div.post_time'):
                t = util.get_time_string_from_selectors(doc,
                                                        {'div.post_time'})
                t_stamp = util.get_timestamp_from_string(t)
                # Pass self.OFFSET so the cutoff tracks the crawl offset,
                # as the other page filters do.
                if t_stamp >= util.get_day_stamp(self.OFFSET):
                    return True
    return False
def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t = doc('meta[property="article:published_time"]').attr('content')
            t_stamp = util.get_timestamp_from_string(t)
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False
def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            with self.url_time_dict_lock:
                if url in self.url_time_dict:
                    t_stamp = self.url_time_dict[url]
                    if t_stamp >= util.get_day_stamp(self.OFFSET):
                        return True
            return False
    return False
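# page_filter above only reads self.url_time_dict; the producer side (e.g.
# the listing-page parser) must fill it under the same lock. A minimal
# sketch of that producer, assuming pyquery-style docs; every selector and
# helper name here is hypothetical except url_time_dict and
# url_time_dict_lock:

def _record_listing_times(self, doc):
    for item in doc('.news-item').items():  # hypothetical listing selector
        href = item('a').attr('href')
        t_stamp = util.get_timestamp_from_string(item('.time').text())
        with self.url_time_dict_lock:
            self.url_time_dict[href] = t_stamp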
def page_filter(self, doc, url):
    t_stamp = util.get_day_stamp(self.OFFSET)
    wanted = False
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            art_date = doc('meta[name=artpdate]').attr('content')
            if art_date:
                if int(time.mktime(time.strptime(
                        art_date, '%Y-%m-%d %H:%M:%S'))) >= t_stamp:
                    wanted = True
    return wanted
def get_auto_configured_spider(cls, offset=0):
    eastweek_seed = {'http://eastweek.my-magazine.me/main/'}
    eastweek_reg = {ur'http://eastweek\.my-magazine\.me/main/\d+'}
    spider_eastweek = SpiderEastWeek('SpiderEastWeek', eastweek_seed,
                                     eastweek_reg, THREAD_NUM=10)
    spider_eastweek.BATCH_NUMBER = util.get_day_stamp() + 10240
    spider_eastweek.OFFSET = offset
    return spider_eastweek
def get_auto_configured_spider(cls, offset=0):
    # Renamed from 'apple_seed' (copy-paste leftover); this is HKSilicon.
    hksilicon_seed = {'http://www.hksilicon.com/?page=1'}
    spider_hksilicon = SpiderHKSilicon(
        'SpiderHKSilicon', hksilicon_seed,
        {ur'http://www\.hksilicon\.com/articles/\d+'},
        THREAD_NUM=15)
    spider_hksilicon.BATCH_NUMBER = util.get_day_stamp() + 10690
    spider_hksilicon.OFFSET = offset
    return spider_hksilicon
def get_auto_configured_spider(cls, offset=0):
    scmp_seed = {'http://www.scmp.com/news/hong-kong'}
    spider_scmp = SpiderSCMP(
        'SpiderSCMP', scmp_seed,
        {ur'http://www\.scmp\.com/news/hong-kong/.*article/\d+/.*'},
        THREAD_NUM=10)
    spider_scmp.BATCH_NUMBER = util.get_day_stamp() + 10550
    spider_scmp.OFFSET = offset
    return spider_scmp
def get_auto_configured_spider(cls, offset=0):
    ubeat_seed = {'http://ubeat.com.cuhk.edu.hk/'}
    # day_str = util.get_day_string(offset=offset)
    spider_ubeat = SpiderUBeat('SpiderUBeat', ubeat_seed,
                               {ur'http://ubeat\.com\.cuhk\.edu\.hk/\d+.+',
                                ur'http://ubeat\.com\.cuhk\.edu\.hk/mm_.+'},
                               THREAD_NUM=5)
    spider_ubeat.OFFSET = offset
    spider_ubeat.BATCH_NUMBER = util.get_day_stamp() + 10630
    return spider_ubeat
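# All the factories above share one shape: a classmethod taking 'offset' in
# days, returning a configured spider with a per-source BATCH_NUMBER derived
# from util.get_day_stamp(). A minimal sketch of how a scheduler might drive
# them; 'crawl' is an assumed entry point, so substitute whatever run method
# the spider base class actually exposes:

def run_all(spider_classes, offset=0):
    for spider_cls in spider_classes:
        spider = spider_cls.get_auto_configured_spider(offset=offset)
        if spider is None:
            # e.g. SpiderMingPao returns None when the day's issue is missing
            continue
        spider.crawl()  # hypothetical entry point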