def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = ''
    t = ''
    t_stamp = 0
    category = ''
    author = ''
    content = ''
    if instant_pattern.match(task.url):
        title = util.get_filtered_title(doc, {'title'}, ur' - 信報網站 hkej.com')
        t = util.get_time_string_from_selectors(doc, {'span.date'})
        time_part = min_sec_pattern.findall(t)[0]
        t_stamp = util.get_timestamp_from_string(time_part) + time.localtime().tm_sec
        category = doc('span.cate').text()
        content = util.get_paragraphs_from_selector(doc, '#article-content p')
    elif daily_pattern.match(task.url) or headline_article_pattern.match(task.url):
        title = util.get_filtered_title(doc, {'title'}, ur' - .+')
        t = util.get_time_string_from_selectors(doc, {'#date'})
        t_stamp = (util.get_timestamp_from_string(t)
                   + time.localtime().tm_hour * 3600
                   + time.localtime().tm_min * 60
                   + time.localtime().tm_sec)
        category = doc('#hkej_navSubMenu_2014 .on').text()
        content = util.get_paragraphs_from_selector(doc, '#article-content p')
        if content == '':
            content = util.get_paragraphs_from_selector(doc, '#article-detail-wrapper')
        # Strip everything from the "(節錄)" (excerpt) marker onwards. The
        # flags must be passed by keyword: re.sub's fourth positional
        # parameter is count, not flags.
        content = re.sub(ur'(節錄)(.|\n|\t|\r)*', u'', content,
                         flags=re.M | re.I | re.U)

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'HKEJ'
    item.task_no = self.BATCH_NUMBER

    for img in doc('#article-detail-wrapper p img, '
                   '#article-detail-wrapper .hkej_detail_thumb_2014 img').items():
        # attr() returns None when the attribute is missing, so test
        # truthiness rather than comparing against ''.
        if img.parent('a').attr('href'):
            des = ''
            if img.parent('a') and img.parent('a').attr('title'):
                des = img.parent('a').attr('title')
            media = self.NewsItem.MediaItem(media_url=img.parent('a').attr('href'),
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)

    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.*', a.attr('src')):
            media_u = a.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)

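# Why flags= matters in the re.sub calls above: re.sub(pattern, repl,
# string, count, flags) takes count as its fourth positional argument, so
# passing re.M | re.I | re.U positionally silently caps the number of
# substitutions instead of enabling the flags. A stdlib-only illustration:
#
#     re.sub(r'a', 'b', 'aaaa', re.I)        # re.I == 2 -> 'bbaa' (2 subs)
#     re.sub(r'a', 'b', 'aaaa', flags=re.I)  # -> 'bbbb' (all matches)
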
def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'h2'})
    t = util.get_time_string_from_selectors(doc, {'div.dateforarticle'})
    t_stamp = 0
    if relative_time_pattern.match(t):
        t_stamp = self._get_timestamp_from_relative_time_str(t)
    elif absolute_time_pattern.match(t):
        t_stamp = util.get_timestamp_from_string(t) + int(time.localtime().tm_sec)
    category = '新聞'
    author = ''
    content = util.get_paragraphs_from_selector(doc, '#mymain')

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'AM730'
    item.task_no = self.BATCH_NUMBER

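# A minimal sketch of the kind of parsing _get_timestamp_from_relative_time_str
# performs, assuming AM730 renders relative stamps such as u'3 分鐘前'
# ("3 minutes ago") or u'2 小時前' ("2 hours ago"). The unit words and this
# standalone helper are illustrative assumptions; the real method is defined
# on the spider class and may differ.
def _relative_time_str_to_timestamp_sketch(t):
    # Hypothetical units; adjust to whatever the site actually emits.
    seconds_per_unit = {u'分鐘': 60, u'小時': 3600, u'日': 86400}
    match = re.findall(ur'(\d+)\s*(分鐘|小時|日)前', t)
    if not match:
        return 0
    value, unit = match[0]
    return int(time.time()) - int(value) * seconds_per_unit[unit]
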
def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'.PlaylistRow kanhanpass'})
    t = util.get_time_string_from_selectors(doc, {'.PlaylistRow td'},
                                            date_patterns={date_pattern})
    t_stamp = util.get_timestamp_from_string(t, '%Y/%m/%d %I:%M %p') + int(
        time.localtime().tm_sec)
    category = doc('#topMenu a.on').text()
    author = ''

    # The meeting ID embedded in the URL keys the agenda lookup below.
    c_id = re.findall(url_id_pattern, task.url)[0]
    r = requests.get(
        'http://webcast.legco.gov.hk/Public_uat_embedded/Service1.asmx/GetTimeMarker?meetingID='
        + c_id + '&lang=tc')
    j_obj = json.loads(r.text)
    content = ''
    for agenda in j_obj['TimeMarkerItems']:
        a_time = agenda['AgendaTime']
        a_item = agenda['AgendaItem']
        content = content + a_time + u' - ' + a_item + u'\n'

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'LegislativeCouncil'
    item.task_no = self.BATCH_NUMBER

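# The GetTimeMarker response schema is not shown in this codebase; judging
# from the fields read above, it presumably resembles the following (field
# values invented for illustration):
#
#     {"TimeMarkerItems": [{"AgendaTime": "09:00", "AgendaItem": "..."},
#                          ...]}
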
def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'h1.conttit'})
    t = util.get_time_string_from_selectors(doc, {'div.pubtime'})
    t_stamp = util.get_timestamp_from_string(t)
    category = 'hk'
    author = ''
    content = util.get_paragraphs_from_selector(doc, 'div.contentbox p')
    content = re.sub(r'READMORE\: .+\n', '', content)

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'ChinaDaily'
    item.task_no = self.BATCH_NUMBER

    for img in doc('div.contentbox img').items():
        # attr() returns None for a missing attribute; a truthiness check
        # avoids passing None into re.sub below.
        if img.attr('src'):
            media_u = ('http://www.chinadailyasia.com/'
                       + re.sub(r'.+(?=attachement)', '', img.attr('src')))
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            elif img.siblings('p'):
                des = img.siblings('p').text()
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'title'}, u'881903.com 商業電台 - ')
    t = util.get_time_string_from_selectors(
        doc, {'#divnewsTextDate', '#part6808_ctl00_lblDetailDate'})
    t_stamp = util.get_timestamp_from_string(t, '%d.%m.%Y %H:%M') + int(
        time.localtime().tm_sec)
    category = doc('#part8425_ctl00_divtitle').text()
    author = ''
    # Fall back through the site's alternative content containers.
    content = util.get_paragraphs_from_selector(doc, '#divnewsTextContent p')
    if content == '':
        content = util.get_paragraphs_from_selector(doc, '#tdContent p')
    if content == '':
        content = util.get_paragraphs_from_selector(doc, '.newsTextContent2')

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'CommercialRadio'
    item.task_no = self.BATCH_NUMBER

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'h1 a'})
    t = util.get_time_string_from_selectors(doc, {'span.postdate'})
    t_stamp = util.get_timestamp_from_string(t) + int(time.localtime().tm_sec)
    category = doc('span.postcat a').text()
    author = doc('span.postauthor a').text()
    content = util.get_paragraphs_from_selector(doc, 'div p')
    # Strip the submission footer, the comment counter and the site's
    # standing epigraph. flags= must be passed by keyword: re.sub's fourth
    # positional parameter is count, not flags.
    content = re.sub(ur'投稿:[.\n\r\t]*.*', u'', content,
                     flags=re.M | re.U | re.I)
    content = re.sub(ur'則留言[.\n\r\t]*', u'', content,
                     flags=re.M | re.U | re.I)
    content = re.sub(ur'大道之行也,天下為公,選賢與能,講信修睦。---《禮運.大同》[.\n\r\t]*',
                     u'', content, flags=re.M | re.U | re.I)

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'VJMedia'
    item.task_no = self.BATCH_NUMBER

    for img in doc('#container img.size-full, #container img.size-large').items():
        if img.attr('src'):
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            elif img.siblings('p'):
                des = img.siblings('p').text()
            media = self.NewsItem.MediaItem(media_url=img.attr('src'),
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)

    for iframe in doc('iframe').items():
        if iframe.attr('src') and re.match(r'.*youtube\.com.+', iframe.attr('src')):
            media = self.NewsItem.MediaItem(media_url=iframe.attr('src'),
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)

    if util.within_active_interval(6, 1200):
        _comments = util.get_filtered_facebook_comments_data(
            '214585295294555', doc('div.fb-comments').attr('data-href'), task.url)
        if _comments:
            for _comment in _comments:
                item.media_list.append(
                    self.NewsItem.MediaItem(media_url=_comment['json_string'],
                                            type='comments',
                                            description='comments',
                                            created_at=item.fetched_at))

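# Reading from the call site above (the helper's definition is not shown
# here), util.get_filtered_facebook_comments_data appears to take the site's
# numeric Facebook app/page ID, the data-href of the page's comment plugin,
# and the article URL. The exact contract is an assumption; only the call
# shape is evidenced in this code.
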
def normal_item_solver(self, item, task, response):
    response.encoding = 'utf-8'
    doc = self.get_doc(response)

    title = doc('#page-h1').text()
    t = util.get_time_string_from_selectors(doc, {'h5 small'}, {date_pattern})
    t_stamp = (util.get_timestamp_from_string(t)
               + time.localtime().tm_hour * 3600
               + time.localtime().tm_min * 60
               + time.localtime().tm_sec)
    category = doc('h5 small a').text()
    author = ''
    content = util.get_paragraphs_from_selector(doc, '.content-show p')

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'Bauhinia'
    item.task_no = self.BATCH_NUMBER

def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t = util.get_time_string_from_selectors(doc, {'span.time'})
            t_stamp = util.get_timestamp_from_string(t)
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False

def task_filter(self, doc, url, doc_url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            if not reg_pattern.match(doc_url):
                return True
            t = util.get_time_string_from_selectors(doc, {'#article_date'})
            t_stamp = util.get_timestamp_from_string(t)
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
    return False

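# An inference from the filters above, not confirmed by any framework code
# shown here: page_filter appears to decide whether an already-fetched page
# should be parsed into an item, while task_filter decides whether a link
# found on doc_url is worth enqueuing at all. Both gate on
# util.get_day_stamp(self.OFFSET) so that only articles newer than the
# configured offset survive.
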
def page_filter(self, doc, url):
    wanted = False
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            if doc('div.post_time'):
                t = util.get_time_string_from_selectors(doc, {'div.post_time'})
                t_stamp = util.get_timestamp_from_string(t)
                if t_stamp >= util.get_day_stamp():
                    return True
    return wanted

def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t = util.get_time_string_from_selectors(
                doc, {'#divnewsTextDate', '#part6808_ctl00_lblDetailDate'})
            t_stamp = util.get_timestamp_from_string(t, '%d.%m.%Y %H:%M')
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'title'}, ur' - .*')
    t = ''
    t_stamp = 0
    if doc('time') or doc('span.date'):
        t = util.get_time_string_from_selectors(doc, {'time', 'span.date'})
        t_stamp = util.get_timestamp_from_string(t) + int(time.localtime().tm_sec)
    category = re.sub(ur'.*\s+', u'', doc('.dropdown-menu li.active a').text())
    author = ''
    content = util.get_paragraphs_from_selector(doc, '#news-content')
    if content == '':
        content = util.get_paragraphs_from_selector(doc, '.content span')

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'HeadlineNews'
    item.task_no = self.BATCH_NUMBER

    for img in doc('.content .item img').items():
        # attr() returns None for a missing attribute, so test truthiness.
        if img.attr('src'):
            media_u = img.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)

    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
            media_u = a.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)

    if util.within_active_interval(12, 600):
        _comments = util.get_filtered_facebook_comments_data(
            '978368502211772', doc('div.fb-comments').attr('data-href'), task.url)
        if _comments:
            for _comment in _comments:
                item.media_list.append(
                    self.NewsItem.MediaItem(media_url=_comment['json_string'],
                                            type='comments',
                                            description='comments',
                                            created_at=item.fetched_at))

def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t = util.get_time_string_from_selectors(
                doc, {'.PlaylistRow td'}, date_patterns={date_pattern})
            t_stamp = util.get_timestamp_from_string(t, '%Y/%m/%d %I:%M %p')
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False

def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            if doc('td'):
                t = util.get_time_string_from_selectors(doc, {'td'}, {date_pattern})
                t_stamp = util.get_timestamp_from_string(t, '%Y-%m-%d %H:%M:%S')
                if t_stamp >= util.get_month_day_timestamp(self.OFFSET):
                    return True
                return False
            return False
    return False

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = util.get_time_string_from_selectors(doc, {'h6'})
    item.t_stamp = 0
    # Convert the Chinese meridiem marker to AM/PM so strptime's %p can
    # parse it; the 下午 branch mirrors the matching page_filter below.
    if re.findall(ur'上午', item.t):
        item.t_stamp = util.get_timestamp_from_string(
            re.sub(ur'上午', u'AM', item.t), u'%Y年%m月%d日 %p%I:%M')
    elif re.findall(ur'下午', item.t):
        item.t_stamp = util.get_timestamp_from_string(
            re.sub(ur'下午', u'PM', item.t), u'%Y年%m月%d日 %p%I:%M')

def page_filter(self, doc, url):
    wanted = False
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            if doc('div.video_date span'):
                t = util.get_time_string_from_selectors(
                    doc, {'div.video_date span'}, date_patterns={date_pattern})
                t_stamp = util.get_timestamp_from_string(t)
                if t_stamp >= util.get_day_stamp(self.OFFSET):
                    return True
    return wanted

def page_filter(self, doc, url):
    wanted = False
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            if date_pattern.findall(doc('span.pull-right').text()):
                t = util.get_time_string_from_selectors(
                    doc, {'span.pull-right'}, date_patterns={date_pattern})
                t_stamp = util.get_timestamp_from_string(
                    t, time_format=u'%Y 年 %m 月 %d 日')
                if t_stamp >= util.get_day_stamp(self.OFFSET):
                    return True
    return wanted

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'title'}, ur' \| .*')
    t = util.get_time_string_from_selectors(doc, {'p.dateFormat'})
    t_stamp = util.get_timestamp_from_string(t) + int(time.localtime().tm_sec)
    category = ''
    # The category is encoded in the URL path segment after 'hongkong/'.
    cat_parts = re.findall(ur'(?<=hongkong/)\d+-.+?(?=/\d+.*)', task.url)
    if cat_parts:
        category = re.findall(ur'(?<=-).*', cat_parts[0])[0]

def page_filter(self, doc, url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            t = util.get_time_string_from_selectors(doc, {'div.dateforarticle'})
            t_stamp = 0
            if relative_time_pattern.match(t):
                t_stamp = self._get_timestamp_from_relative_time_str(t)
            elif absolute_time_pattern.match(t):
                t_stamp = util.get_timestamp_from_string(t)
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'title'}, u'中國評論新聞:')
    t = util.get_time_string_from_selectors(doc, {'td'}, {date_pattern})
    t_stamp = util.get_timestamp_from_string(t, '%Y-%m-%d %H:%M:%S')
    category = ''
    # The category link only appears inside an inline script; take the last
    # anchor that carries the column id from the URL.
    scripts = doc('script').text()
    if re.findall(ur'coluid=\d+', task.url):
        col_id_str = re.findall(ur'coluid=\d+', task.url)[0]
        cat_block = re.findall(r'<a[^<>]*?' + col_id_str + '.+?</a>', scripts)[-1]
        cat_doc = pq(cat_block)
        category = cat_doc.text()

def page_filter(self, doc, url):
    wanted = False
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            if doc('h6'):
                t = util.get_time_string_from_selectors(doc, {'h6'})
                t_stamp = 0
                if re.findall(ur'上午', t):
                    t_stamp = util.get_timestamp_from_string(
                        re.sub(ur'上午', u'AM', t), u'%Y年%m月%d日 %p%I:%M')
                elif re.findall(ur'下午', t):
                    t_stamp = util.get_timestamp_from_string(
                        re.sub(ur'下午', u'PM', t), u'%Y年%m月%d日 %p%I:%M')
                if t_stamp >= util.get_day_stamp(offset=self.OFFSET):
                    return True
    return wanted

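# The two branches above only swap the Chinese meridiem marker for a token
# that strptime's %p directive understands, e.g. (sample date invented):
#
#     re.sub(ur'下午', u'PM', u'2016年5月3日 下午4:21')
#     # -> u'2016年5月3日 PM4:21', parseable with u'%Y年%m月%d日 %p%I:%M'
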
def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = util.get_time_string_from_selectors(doc, {'div.post_time'})
    item.t_stamp = util.get_now()
    item.fetched_at = task.fetched_at
    item.category = doc('div.post_cats a:last-child').text()
    item.author = doc('.single_author a').text()
    item.content = util.get_paragraphs_from_selector(doc, 'div.single_text p')
    item.url = task.url
    item.source = 'DMHK'
    item.task_no = self.BATCH_NUMBER

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'title'}, ur'\|.*')
    t = util.get_time_string_from_selectors(doc, {'span.posted-time'})
    t_stamp = (util.get_timestamp_from_string(t)
               + time.localtime().tm_hour * 3600
               + time.localtime().tm_min * 60
               + time.localtime().tm_sec)
    category = doc('span.channel-section').text()
    author = ''
    content = util.get_paragraphs_from_selector(doc, 'div.article-content p')

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    # Keep the final URL after any redirects.
    item.url = task.redirected_url
    item.source = 'Initium'
    item.task_no = self.BATCH_NUMBER

    for img in doc('.main-content .image img').items():
        if img.attr('src'):
            media_u = img.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)

    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
            media_u = a.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)

def task_filter(self, doc, url, doc_url):
    for reg_pattern in self.reg_patterns:
        if reg_pattern.match(url):
            if year_date_pattern.findall(url):
                t = year_date_pattern.findall(url)[0]
                t_stamp = util.get_timestamp_from_string(t)
            else:
                if doc('time') or doc('span.date'):
                    t = util.get_time_string_from_selectors(doc, {'time', 'span.date'})
                    t_stamp = util.get_timestamp_from_string(t)
                else:
                    # No date anywhere: accept the task rather than drop it.
                    return True
            if t_stamp >= util.get_day_stamp(self.OFFSET):
                return True
            return False
    return False

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'title'}, ur' - RTHK')
    t = util.get_time_string_from_selectors(doc, {'div.createddate'})
    t_stamp = util.get_timestamp_from_string(
        t, '%Y-%m-%d HKT %H:%M') + time.localtime().tm_sec
    category = ''
    if cat_pattern.findall(doc('script').text()):
        category = cat_pattern.findall(doc('script').text())[-1]
    author = ''
    content = util.get_paragraphs_from_selector(doc, 'div.itemFullText')

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'RTHK'
    item.task_no = self.BATCH_NUMBER

    for img in doc('img.imgPhotoAfterLoad').items():
        if img.attr('src'):
            media_u = img.attr('src')
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)

    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
            media_u = a.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'h1'})
    t = util.get_time_string_from_selectors(doc, {'time.timestamp'})
    t_stamp = util.get_timestamp_from_string(t)
    category = ''
    if doc('.article-breadCrumb a'):
        category = doc('.article-breadCrumb a').text()
    author = doc('.author span').text()
    content = util.get_paragraphs_from_selector(doc, '#A p')

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'WSJ'
    item.task_no = self.BATCH_NUMBER

    for img in doc('#A .media-object-image img').items():
        if img.attr('src'):
            media_u = img.attr('src')
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)

    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
            media_u = a.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'title'},
                                         u'東周網【東周刊官方網站】| - .+')
    item.t = util.get_time_string_from_selectors(
        doc, {'span.pull-right'}, date_patterns={date_pattern})
    item.t_stamp = (util.get_timestamp_from_string(
        item.t, time_format=u'%Y 年 %m 月 %d 日')
        + time.localtime().tm_hour * 3600
        + time.localtime().tm_min * 60
        + time.localtime().tm_sec)
    item.fetched_at = task.fetched_at
    item.category = re.sub(ur'.*\s+', u'', doc('.default-group a').text())
    item.author = ''
    item.content = util.get_paragraphs_from_selector(doc, 'div.view-content p')
    item.url = task.url
    item.source = 'EastWeek'
    item.task_no = self.BATCH_NUMBER

def normal_item_solver(self, item, task, response):
    doc = self.get_doc(response)

    t = util.get_time_string_from_selectors(doc, {'span.time'})
    t_stamp = util.get_timestamp_from_string(t) + int(time.localtime().tm_sec)
    # Drop the nested <span> before reading the headline text.
    title = doc('h4').remove('span').text()
    category = doc('#topMenu a.on').text()
    author = ''
    content = util.get_paragraphs_from_selector(doc, '#c1_afterplayer pre')

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'TVB'
    item.task_no = self.BATCH_NUMBER

def normal_item_solver(self, item, task, response):
    response.encoding = 'utf-8'
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'title'})
    t = util.get_time_string_from_selectors(doc, {'.tm'}, date_patterns)
    t_stamp = util.get_timestamp_from_string(t)
    # Trim the last two characters off the section heading.
    category = (doc('h2 b').text())[:-2]
    author = doc('.fdr span').text()
    content = util.get_paragraphs_from_selector(doc, '.wz_nr p')

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'HKCNA'
    item.task_no = self.BATCH_NUMBER

def get_auto_configured_spider(cls, offset=0):
    legco_seed = {'http://webcast.legco.gov.hk/public/zh-hk/SearchResult'}
    r = requests.get('http://webcast.legco.gov.hk/public/zh-hk/SearchResult')
    headers = r.headers
    headers['Referer'] = 'http://webcast.legco.gov.hk/public/zh-hk/SearchResult'
    _page = 1
    # Walk the paginated search results until the newest entry on a page is
    # older than the configured offset.
    while True:
        r = requests.get(
            'http://webcast.legco.gov.hk/public/zh-hk/SearchResult?page='
            + str(_page), headers=headers)
        d = pq(r.text)
        t = util.get_time_string_from_selectors(
            d, {'tr.PlaylistRow td'}, date_patterns={date_pattern})
        t_stamp = util.get_timestamp_from_string(t, '%Y/%m/%d %I:%M %p')
        if t_stamp >= util.get_day_stamp(offset):
            _page += 1
            for entry in d('tr.PlaylistRow').items():
                if re.findall(entry_id_pattern, entry.attr('id')):
                    c_id = re.findall(entry_id_pattern, entry.attr('id'))[0]
                    legco_seed.add(
                        'http://webcast.legco.gov.hk/public/zh-hk/SearchResult?MeetingID='
                        + c_id)
        else:
            break
    spider_legco = SpiderLegco(
        'SpiderLegco', legco_seed,
        {ur'http://webcast\.legco\.gov\.hk/public/zh-hk/SearchResult\?MeetingID=.+'},
        THREAD_NUM=5)
    spider_legco.OFFSET = offset
    spider_legco.BATCH_NUMBER = util.get_day_stamp() + 10300
    return spider_legco

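# A usage sketch, assuming the surrounding crawler framework instantiates
# and drives spiders roughly like this (the run() entry point is an
# assumption; substitute whatever method the framework actually exposes):
#
#     spider = SpiderLegco.get_auto_configured_spider(offset=1)
#     spider.run()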