예제 #1
0
 def get_auto_configured_spider(cls, offset=0):
     day_str = util.get_day_string(offset=offset)
     hket_seed = {'http://inews.hket.com/sran001/%E5%85%A8%E9%83%A8?dis=' + day_str}
     spider_hket = SpiderHKET('SpiderHKET', hket_seed, {ur'http://.+\.hket\.com/article/\d+/.*'}, THREAD_NUM=5)
     spider_hket.BATCH_NUMBER = util.get_day_stamp(offset=offset) + 10110
     spider_hket.OFFSET = offset
     return spider_hket
예제 #2
0
 def get_auto_configured_spider(cls, offset=0):
     dmhk_seed = {'http://news.dmhk.net/'}
     util.add_hrefs('http://news.dmhk.net/', {'#mega_main_menu_ul a'}, dmhk_seed)
     dmhk_reg = {ur'http://news\.dmhk\.net/?p=\d+'}
     spider_dmhk = SpiderDMHK('SpiderDMHK', dmhk_seed, dmhk_reg, THREAD_NUM=5)
     spider_dmhk.BATCH_NUMBER = util.get_day_stamp() + 10230
     return spider_dmhk
예제 #3
0
 def get_auto_configured_spider(cls, offset=0):
     """Assemble a SpiderHKCD covering both hk.hkcd.com and www.hkcd.com."""
     seeds = {'http://hk.hkcd.com/node_25195.htm'}
     # Expand the HK edition section index into individual node pages.
     util.add_hrefs(url='http://hk.hkcd.com/node_30602.htm',
                    selectors={'a'},
                    seeds=seeds,
                    seed_patterns={node_pattern},
                    prefix='http://hk.hkcd.com/')
     seeds.add('http://www.hkcd.com/')
     hk_categories = _get_hk_cat_dict(seeds, {content_pattern})
     categories = _get_cat_dict(
         {'http://www.hkcd.com/content/2016-07/18/content_1008717.html'})
     # Article URLs embed the date as YYYY-MM/DD in their path.
     day_digits = util.get_day_string(offset=offset)
     day_path = '%s-%s/%s' % (day_digits[0:4], day_digits[4:6], day_digits[6:8])
     url_regs = {r'http://(www|hk)\.hkcd\.com/content/' + day_path + '/.*'}
     spider = SpiderHKCD('SpiderHKCD', seeds, url_regs,
                         THREAD_NUM=10, MAX_DEPTH=2)
     spider._hk_cat_dict = hk_categories
     spider._cat_dict = categories
     spider.BATCH_NUMBER = util.get_day_stamp() + 10120
     return spider
예제 #4
0
 def get_auto_configured_spider(cls, offset=0):
     cats = dict()
     cats['105'] = '海外綜合'
     cats['102'] = '新聞專輯'
     singtao_seed = {'http://std.stheadline.com/'}
     util.add_hrefs('http://std.stheadline.com/',
                    selectors={'#navbar ul.nav li.has-children > a'},
                    seeds=singtao_seed)
     '''
     id_pattern = re.compile(ur'(?<=php\?cat=)\d+')
     for cat in d('ul.sub-menu a').items():
         if re.findall(id_pattern, cat.attr('href')):
             cats[re.findall(id_pattern, cat.attr('href'))[0]] = cat.text()
         if instant_cat_pattern.match(cat.attr('href')):
             singtao_seed.add(cat.attr('href'))
     for k, v in cats.iteritems():
         singtao_seed.add('http://std.stheadline.com/daily/section-list.php?cat=' + k)
     '''
     spider_singtao = SpiderSingTao(
         'SpiderSingTao',
         singtao_seed, {
             ur'(http://std\.stheadline\.com/daily/news-content\.php.*)|(http://std\.stheadline\.com/instant/articles/detail/\d+.*)'
         },
         THREAD_NUM=1,
         cats=cats)
     spider_singtao.FETCH_DELAY = 0.5
     spider_singtao.BATCH_NUMBER = util.get_day_stamp() + 10060
     spider_singtao.OFFSET = offset
     return spider_singtao
예제 #5
0
 def get_auto_configured_spider(cls, offset=0):
     gov_seed = set()
     gov_seed.add('http://www.news.gov.hk/tc/index.shtml')
     gov_r = requests.get('http://www.news.gov.hk/tc/index.shtml')
     gov_index = pq(gov_r.text)
     gov_seed.add('http://archive.news.gov.hk/tc/city_life/html/' +
                  str(time.localtime().tm_year) + '/' +
                  ('%02d' % time.localtime().tm_mon) + '/index.shtml')
     gov_seed.add('http://archive.news.gov.hk/tc/record/html/' +
                  str(time.localtime().tm_year) + '/' +
                  ('%02d' % time.localtime().tm_mon) + '/index.shtml')
     gov_sub_cats = gov_index('div[id=subnav] a').items()
     for a in gov_sub_cats:
         url_a = a.attr('href')
         url_a = 'http://archive.news.gov.hk' + url_a[:-11] + 'html/' + str(
             time.localtime().tm_year) + '/' + (
                 '%02d' % time.localtime().tm_mon) + '/index.shtml'
         gov_seed.add(url_a)
     spider_gov = SpiderGov(
         'SpiderGov',
         gov_seed,
         {ur'http://.*\.news\.gov\.hk/tc/.*/\d\d\d\d/\d\d/\d+_\d+.*'},
         THREAD_NUM=10,
         MAX_DEPTH=2)
     spider_gov.AUTO_FILTER_URL_PARAS = True
     spider_gov.OFFSET = offset
     spider_gov.BATCH_NUMBER = util.get_day_stamp() + 10004
     return spider_gov
예제 #6
0
 def get_auto_configured_spider(cls, offset=0, **kwargs):
     day_str = util.get_day_string(offset=offset)
     issue_dict_url = 'http://news.mingpao.com/dat/pns/issuelist.js?819181'
     r = requests.get(issue_dict_url)
     json_obj = json.loads(r.text)
     if '1 ' + day_str in json_obj['PNS_WEB_TC']:
         issue_id = json_obj['PNS_WEB_TC']['1 ' + day_str]['E']
     else:
         print 'KEY ERROR: ' + json_obj['PNS_WEB_TC']['1 ' + day_str]
         return
     news_list_url = 'http://news.mingpao.com/dat/pns/pns_web_tc/feed1/' + day_str + issue_id + '/content.js'
     mingpao_seed = set()
     r = requests.get(news_list_url)
     if re.findall(r'feed_module_2', r.text):
         news_list_data = news_list_data_pattern.findall(r.text)[0]
         json_obj = json.loads(news_list_data)
         for it in json_obj['rss']['channel']['item']:
             mingpao_seed.add(
                 'http://news.mingpao.com/dat/pns/pns_web_tc/article1/' + day_str + issue_id.lower() + '/todaycontent_' + str(
                     it['ATTRIBUTES']['NODEID']) + '.js')
     mingpao_reg = {ur'http://news\.mingpao\.com/dat/pns/.*' + day_str + '.+'}
     spider_mingpao = SpiderMingPao('SpiderMingPao', mingpao_seed, mingpao_reg, THREAD_NUM=10)
     spider_mingpao.OFFSET = offset
     spider_mingpao.BATCH_NUMBER = util.get_day_stamp(offset) + 10570
     return spider_mingpao
예제 #7
0
 def page_filter(self, doc, url):
     """Keep pages whose extracted timestamp falls on/after the target day."""
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         # Module-level 'selectors'/'date_patterns' drive the date extraction,
         # so the stamp is the same whichever pattern matched.
         t_stamp = util.get_timestamp_from_selectors(doc=doc, selectors=selectors, date_patterns=date_patterns)
         return t_stamp >= util.get_day_stamp(self.OFFSET)
     return False
예제 #8
0
 def task_filter(self, doc, url, doc_url):
     """Accept URLs whose embedded MM-DD-YYYY date is on/after the target day."""
     for pattern in self.reg_patterns:
         if pattern.match(url):
             date_text = full_article_pattern.findall(url)[0]
             stamp = util.get_timestamp_from_string(date_text, '%m-%d-%Y')
             return stamp >= util.get_day_stamp(offset=self.OFFSET)
     return False
예제 #9
0
 def page_filter(self, doc, url):
     """Keep matched pages whose '.updated' element parses to a recent time."""
     if not any(p.match(url) for p in self.reg_patterns):
         return False
     stamp = util.get_timestamp_from_string(doc('.updated').text())
     return stamp >= util.get_day_stamp(self.OFFSET)
예제 #10
0
 def get_auto_configured_spider(cls, offset=0):
     _852_seed = {'http://www.post852.com/'}
     util.add_hrefs('http://www.post852.com/', {'#rightnav a'}, _852_seed)
     spider_852 = Spider852('Spider852',
                            _852_seed, {ur'http://www.post852.com/\d+/.+'},
                            THREAD_NUM=5)
     spider_852.OFFSET = offset
     spider_852.BATCH_NUMBER = util.get_day_stamp() + 10400
     return spider_852
예제 #11
0
 def get_auto_configured_spider(cls, offset=0):
     y28_seed = {'http://www.y28club.com/y28news/cgi-bin/news/newshome.pl'}
     spider_y28 = SpiderY28(
         'SpiderY28',
         y28_seed, {ur'http://www\.y28club\.com/y28news/cgi-bin/news/.+'},
         THREAD_NUM=10)
     spider_y28.BATCH_NUMBER = util.get_day_stamp() + 10530
     spider_y28.OFFSET = offset
     return spider_y28
예제 #12
0
 def get_auto_configured_spider(cls, offset=0):
     reuters_seed = {'http://cn.reuters.com/markets/hongkong'}
     spider_reuters = SpiderReuters(
         'SpiderReuters',
         reuters_seed, {ur'http://cn\.reuters\.com/article/.+'},
         THREAD_NUM=10)
     spider_reuters.BATCH_NUMBER = util.get_day_stamp() + 10560
     spider_reuters.OFFSET = offset
     return spider_reuters
예제 #13
0
 def get_auto_configured_spider(cls, offset=0):
     edb_seed = {'http://www.edb.gov.hk/tc/news/all.html'}
     spider_edb = SpiderEDB('SpiderEDB',
                            edb_seed,
                            {ur'http://www\.edb\.gov\.hk/.+\.htm.*'},
                            THREAD_NUM=5)
     spider_edb.OFFSET = offset
     spider_edb.BATCH_NUMBER = util.get_day_stamp() + 10470
     return spider_edb
예제 #14
0
 def get_auto_configured_spider(cls, offset=0):
     newcent_seed = {'http://www.ncforum.org.hk/news'}
     spider_newcent = SpiderNewCenturyForum('SpiderNewCenturyForum',
                                            newcent_seed,
                                            {ur'http://www\.ncforum\.org\.hk/news/.+'},
                                            THREAD_NUM=5)
     spider_newcent.OFFSET = offset
     spider_newcent.BATCH_NUMBER = util.get_day_stamp() + 10450
     return spider_newcent
예제 #15
0
 def get_auto_configured_spider(cls, offset=0):
     unwire_seed = {'https://unwire.hk/articles/page/1/'}
     spider_unwire = SpiderUnwire('SpiderUnwire',
                                  unwire_seed,
                                  {ur'https\://unwire\.hk/\d{4}/\d\d/\d\d/.+'},
                                  THREAD_NUM=10, MAX_DEPTH=2)
     spider_unwire.BATCH_NUMBER = util.get_day_stamp() + 10680
     spider_unwire.OFFSET = offset
     return spider_unwire
예제 #16
0
 def get_auto_configured_spider(cls, offset=0):
     jd_seed = {'http://www.jdonline.com.hk/index/index.php'}
     spider_jd = SpiderJD(
         'SpiderJD',
         jd_seed, {ur'http://www\.jdonline\.com\.hk.+\?news_id=\d+.+'},
         THREAD_NUM=10)
     spider_jd.BATCH_NUMBER = util.get_day_stamp() + 10540
     spider_jd.OFFSET = offset
     return spider_jd
예제 #17
0
 def page_filter(self, doc, url):
     """Keep matched pages whose dateCreated stamp (+8h shift) is recent."""
     if not any(p.match(url) for p in self.reg_patterns):
         return False
     raw = doc('div[itemprop="dateCreated"]').attr('datetime')
     # +8h shift: the page's timestamp is presumably UTC while the day-stamp
     # threshold is local (HKT) — TODO confirm against util's conventions.
     stamp = util.get_timestamp_from_string(raw) + 8 * 3600
     return stamp >= util.get_day_stamp(self.OFFSET)
예제 #18
0
 def get_auto_configured_spider(cls, offset=0):
     speakout_seed = {'http://speakout.hk/'}
     spider_speakout = SpiderSpeakout('SpiderSpeakout',
                                      speakout_seed,
                                      {ur'http://.*\d\d\d\d-\d\d-\d\d.+'},
                                      THREAD_NUM=10)
     spider_speakout.BATCH_NUMBER = util.get_day_stamp() + 10520
     spider_speakout.OFFSET = offset
     return spider_speakout
예제 #19
0
 def page_filter(self, doc, url):
     """Keep matched pages whose 'span.time' text is on/after the target day."""
     if not any(p.match(url) for p in self.reg_patterns):
         return False
     time_text = util.get_time_string_from_selectors(doc, {'span.time'})
     stamp = util.get_timestamp_from_string(time_text)
     return stamp >= util.get_day_stamp(self.OFFSET)
예제 #20
0
 def get_auto_configured_spider(cls, offset=0):
     hkej_seed = {'http://www2.hkej.com/instantnews', 'http://www.hkej.com/template/landing11/jsp/main.jsp', 'http://www1.hkej.com/dailynews/toc?date='+util.get_day_string('-', offset=offset)}
     # util.add_hrefs('http://www.hkej.com/template/landing11/jsp/main.jsp', {'a'}, hkej_seed, seed_patterns={re.compile(ur'http://www.*hkej\.com/.+')})
     # ** currently the reg of the pages is only for 'instant news'
     hkej_reg = {ur'http://www.*?\.hkej\.com/instantnews.*article/.+', ur'http://www1\.hkej\.com/.*dailynews/.*article/.+'}
     spider_hkej = SpiderHKEJ('SpiderHKEJ', hkej_seed, hkej_reg, THREAD_NUM=10, MAX_DEPTH=1)
     spider_hkej.BATCH_NUMBER = util.get_day_stamp() + 10150
     spider_hkej.OFFSET = offset
     return spider_hkej
예제 #21
0
 def _time_filter(self, doc, url):
     """Return True when *doc* is recent enough to keep (or has no date).

     Caches the verdict for the most recently inspected document: when the
     exact same object is passed again (identity check against self._doc),
     the stored self._doc_wanted verdict is reused instead of re-extracting
     the timestamp.
     NOTE(review): assumes self._doc/_doc_wanted are maintained by the
     enclosing class before/around this call — confirm against the caller.
     """
     if doc is self._doc:
         return self._doc_wanted
     else:
         # 'selectors' and 'date_patterns' come from module scope.
         t_stamp = util.get_timestamp_from_selectors(doc=doc, selectors=selectors, date_patterns=date_patterns)
         # t_stamp == 0 signals no date could be extracted; such pages are kept.
         if t_stamp >= util.get_day_stamp(self.OFFSET) or t_stamp == 0:
             self._doc_wanted = True
             return True
         self._doc_wanted = False
         return False
예제 #22
0
 def page_filter(self, doc, url):
     for reg_pattern in self.reg_patterns:
         if reg_pattern.match(url):
             meta_txt = doc('.metaStuff').text()
             t = re.findall(ur'[^\s]+月.+', meta_txt)[0]
             t_stamp = util.get_timestamp_from_string(t)
             if t_stamp >= util.get_day_stamp(self.OFFSET):
                 return True
             return False
     return False
예제 #23
0
 def page_filter(self, doc, url):
     """Keep matched pages that carry a recent 'div.post_time' stamp."""
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         # Pages without a post_time element are rejected.
         if doc('div.post_time'):
             time_text = util.get_time_string_from_selectors(doc, {'div.post_time'})
             if util.get_timestamp_from_string(time_text) >= util.get_day_stamp():
                 return True
     return False
예제 #24
0
 def page_filter(self, doc, url):
     """Keep matched pages whose article:published_time meta is recent."""
     if not any(p.match(url) for p in self.reg_patterns):
         return False
     published = doc('meta[property="article:published_time"]').attr('content')
     stamp = util.get_timestamp_from_string(published)
     return stamp >= util.get_day_stamp(self.OFFSET)
예제 #25
0
 def page_filter(self, doc, url):
     """Keep matched URLs whose pre-recorded timestamp is recent enough.

     Timestamps are looked up in self.url_time_dict under its lock; URLs
     without an entry are rejected.
     """
     if not any(p.match(url) for p in self.reg_patterns):
         return False
     with self.url_time_dict_lock:
         if url not in self.url_time_dict:
             return False
         return self.url_time_dict[url] >= util.get_day_stamp(self.OFFSET)
예제 #26
0
 def page_filter(self, doc, url):
     """Keep matched pages whose 'artpdate' meta is on/after the target day."""
     threshold = util.get_day_stamp(self.OFFSET)
     for pattern in self.reg_patterns:
         if not pattern.match(url):
             continue
         # Query once instead of twice; pages without the meta are skipped.
         artpdate = doc('meta[name=artpdate]').attr('content')
         if artpdate:
             stamp = int(time.mktime(time.strptime(artpdate,
                                                   "%Y-%m-%d %H:%M:%S")))
             if stamp >= threshold:
                 return True
     return False
예제 #27
0
 def get_auto_configured_spider(cls, offset=0):
     eastweek_seed = {'http://eastweek.my-magazine.me/main/'}
     eastweek_reg = {ur'http://eastweek\.my-magazine\.me/main/\d+'}
     spider_eastweek = SpiderEastWeek('SpiderEastWeek',
                                      eastweek_seed,
                                      eastweek_reg,
                                      THREAD_NUM=10)
     spider_eastweek.BATCH_NUMBER = util.get_day_stamp() + 10240
     spider_eastweek.OFFSET = offset
     return spider_eastweek
예제 #28
0
 def get_auto_configured_spider(cls, offset=0):
     day_str = util.get_day_string(offset=offset)
     apple_seed = {'http://www.hksilicon.com/?page=1'}
     spider_hksilicon = SpiderHKSilicon(
         'SpiderHKSilicon',
         apple_seed, {ur'http\://www\.hksilicon\.com/articles/\d+'},
         THREAD_NUM=15)
     spider_hksilicon.BATCH_NUMBER = util.get_day_stamp() + 10690
     spider_hksilicon.OFFSET = offset
     return spider_hksilicon
예제 #29
0
 def get_auto_configured_spider(cls, offset=0):
     scmp_seed = {'http://www.scmp.com/news/hong-kong'}
     spider_scmp = SpiderSCMP(
         'SpiderSCMP',
         scmp_seed,
         {ur'http://www\.scmp\.com/news/hong-kong/.*article/\d+/.*'},
         THREAD_NUM=10)
     spider_scmp.BATCH_NUMBER = util.get_day_stamp() + 10550
     spider_scmp.OFFSET = offset
     return spider_scmp
예제 #30
0
 def get_auto_configured_spider(cls, offset=0):
     ubeat_seed = {'http://ubeat.com.cuhk.edu.hk/'}
     # day_str = util.get_day_string(offset=offset)
     spider_ubeat = SpiderUBeat('SpiderUBeat',
                            ubeat_seed,
                            {ur'http://ubeat\.com\.cuhk\.edu\.hk/\d+.+', ur'http://ubeat\.com\.cuhk\.edu\.hk/mm_.+'},
                            THREAD_NUM=5)
     spider_ubeat.OFFSET = offset
     spider_ubeat.BATCH_NUMBER = util.get_day_stamp() + 10630
     return spider_ubeat