def normal_item_solver(self, item, task, response): doc = self.get_doc(response) title = util.get_filtered_title(doc, {'h1 a'}) t = util.get_time_string_from_selectors(doc, {'span.postdate'}) t_stamp = util.get_timestamp_from_string(t) + int( time.localtime().tm_sec) category = doc('span.postcat a').text() author = doc('span.postauthor a').text() content = util.get_paragraphs_from_selector(doc, 'div p') content = re.sub(ur'投稿:[.\n\r\t]*.*', u'', content, re.M | re.U | re.I) content = re.sub(ur'則留言[.\n\r\t]*', u'', content, re.M | re.U | re.I) content = re.sub(ur'大道之行也,天下為公,選賢與能,講信修睦。---《禮運.大同》[.\n\r\t]*', u'', content, re.M | re.U | re.I) item.raw = doc.text() item.title = title item.t = t item.t_stamp = t_stamp item.fetched_at = task.fetched_at item.category = category item.author = author item.content = content item.url = task.url item.source = 'VJMedia' item.task_no = self.BATCH_NUMBER for img in doc( '#container img.size-full, #container img.size-large').items(): if img.attr('src') != '': des = '' if img.attr('alt'): des = img.attr('alt') elif img.siblings('p'): des = img.siblings('p').text() media = self.NewsItem.MediaItem(media_url=img.attr('src'), type='image', description=des, created_at=item.fetched_at) item.media_list.append(media) for iframe in doc('iframe').items(): if iframe.attr('src') and re.match(r'.*youtube\.com.+', iframe.attr('src')): media = self.NewsItem.MediaItem(media_url=iframe.attr('src'), type='youtube', description='youtube', created_at=item.fetched_at) item.media_list.append(media) if util.within_active_interval(6, 1200): _comments = util.get_filtered_facebook_comments_data( '214585295294555', doc('div.fb-comments').attr('data-href'), task.url) if _comments: for _comment in _comments: item.media_list.append( self.NewsItem.MediaItem( media_url=_comment['json_string'], type='comments', description='comments', created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    # Parse a Sina (香港新浪) article page into `item`.
    doc = self.get_doc(response)
    # Strip the " - 香港新浪" suffix from the <title>.
    title = util.get_filtered_title(doc, {'title'}, ur' - 香港新浪')
    t = doc('div.news-datetime').text()
    # Python 2 strptime needs a byte string for the Chinese format
    # directives, hence the utf-8 encode.
    t_stamp = int(
        time.mktime(time.strptime(t.encode('utf-8'), '%Y年%m月%d日 %H:%M')))
    # The category is embedded in an inline <script>; cat_pattern is a
    # module-level regex (defined outside this view).
    scripts = doc('script').text()
    category = ''
    if re.findall(cat_pattern, scripts):
        category = re.findall(cat_pattern, scripts)[0]
    author = ''
    content = util.get_paragraphs_from_selector(doc, 'div.news-body p')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'Sina'
    item.task_no = self.BATCH_NUMBER
    # Body images, skipping ad URLs; the caption comes from the parent
    # node's data-caption attribute when present.
    for img in doc('.news-body img').items():
        if img.attr('src') != '':
            media_u = img.attr('src')
            if not ad_url_pattern.findall(media_u):
                des = ''
                if img.parent().attr('data-caption'):
                    des = img.parent().attr('data-caption')
                media = self.NewsItem.MediaItem(media_url=media_u,
                                                type='image',
                                                description=des,
                                                created_at=item.fetched_at)
                item.media_list.append(media)
    # Embedded YouTube players; add a scheme to protocol-relative URLs.
    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
            media_u = a.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)
    # Facebook comments are only fetched inside the active crawl window.
    if util.within_active_interval(6, 1200):
        _comments = util.get_filtered_facebook_comments_data(
            '114907575364430', doc('div.fb-comments').attr('data-href'),
            task.url)
        if _comments:
            for _comment in _comments:
                item.media_list.append(
                    self.NewsItem.MediaItem(
                        media_url=_comment['json_string'],
                        type='comments',
                        description='comments',
                        created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    # Parse a StandNews article page into `item`.
    doc = self.get_doc(response)
    # Strip the "| ..." suffix from the <title>.
    title = util.get_filtered_title(doc, {'title'}, ur'\s*\|.*')
    t = doc('div.article-content-wrap p.date').text()
    # Date text uses an em dash, e.g. "2016/01/02 — 12:34"; utf-8 encode
    # for Python 2 strptime.
    t_stamp = int(
        time.mktime(time.strptime(t.encode('utf-8'), '%Y/%m/%d — %H:%M')))
    # cat_pattern (module-level regex) extracts the category href from the
    # article URL; the matching menu entry supplies the display name.
    cat_href = cat_pattern.findall(task.url)[0]
    category = doc('ul[id=mainMenuUL] a[href="' + cat_href + '"]').text()
    author = ''
    # Remove inline <style> blocks before extracting paragraph text.
    content = util.get_paragraphs_from_selector(doc.remove('style'),
                                                'div.article-content p')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'StandNews'
    item.task_no = self.BATCH_NUMBER
    # Article images, skipping known ad images (ad_image_pattern is
    # module-level).
    for img in doc('.article-content-wrap img').items():
        if img.attr('src') != '' and not ad_image_pattern.match(
                img.attr('src')):
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            media = self.NewsItem.MediaItem(media_url=img.attr('src'),
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)
    # Embedded YouTube players inside the article body.
    for iframe in doc('.article-content-wrap iframe').items():
        if iframe.attr('src') and youtube_pattern.match(
                iframe.attr('src')):
            media = self.NewsItem.MediaItem(media_url=iframe.attr('src'),
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)
    # Facebook comments are only fetched inside the active crawl window.
    if util.within_active_interval(6, 1200):
        _comments = util.get_filtered_facebook_comments_data(
            '1534089350179685', doc('div.fb-comments').attr('data-href'),
            task.url)
        if _comments:
            for _comment in _comments:
                item.media_list.append(
                    self.NewsItem.MediaItem(
                        media_url=_comment['json_string'],
                        type='comments',
                        description='comments',
                        created_at=item.fetched_at))
def normal_item_solver(self, item, task, response): doc = self.get_doc(response) title = util.get_filtered_title(doc, {'title'}, ur'熱血時報 \| ') author = doc('span.author a').text() t = doc('time[class="published"]').text() t_stamp = int(time.mktime(time.strptime(str(doc('time[class="published"]').text()), "%m-%d-%Y"))) category = doc('div.page-path a').text() doc.remove('script') doc.remove('style') content = util.get_paragraphs_from_selector(doc, 'div.article-body p') if content == '': content = util.get_paragraphs_from_selector(doc, 'div.article-body') if t_stamp >= int(time.mktime(time.strptime(str(time.localtime().tm_year) + str(time.localtime().tm_mon) + str( time.localtime().tm_mday), '%Y%m%d'))): t_stamp = int(time.time()) item.raw = doc.text() item.title = title item.t = t item.t_stamp = t_stamp item.fetched_at = task.fetched_at item.category = category item.author = author item.content = content item.url = task.url item.source = 'PassionTimes' item.task_no = self.BATCH_NUMBER for img in doc('.article-body img').items(): if img.attr('src') != '': media_u = img.attr('src') if re.match(r'//.+', media_u): media_u = 'http:' + media_u elif not re.match(r'http://.+', media_u): media_u = 'http://www.passiontimes.hk' + media_u media = self.NewsItem.MediaItem(media_url=media_u, type='image', description='', created_at=item.fetched_at) item.media_list.append(media) for a in doc('iframe').items(): if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')): media_u = a.attr('src') if re.match(r'//.+', media_u): media_u = 'http:' + media_u media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube', created_at=item.fetched_at) item.media_list.append(media) if util.within_active_interval(6, 1200): _comments = util.get_filtered_facebook_comments_data('462543587117177', doc('div.fb-comments').attr('data-href'), task.url) if _comments: for _comment in _comments: item.media_list.append( 
self.NewsItem.MediaItem(media_url=_comment['json_string'], type='comments', description='comments', created_at=item.fetched_at) )
def normal_item_solver(self, item, task, response): doc = self.get_doc(response) item.raw = doc.text() item.title = util.get_filtered_title(doc, {'.article_tit h1'}, ur'\s*|.*') if item.title == '': item.title = util.get_filtered_title(doc, {'title'}, ur'\s*|.*') item.t = doc('meta[name=artpdate]').attr('content') item.t_stamp = int(time.mktime(time.strptime(item.t, "%Y-%m-%d %H:%M:%S"))) item.fetched_at = task.fetched_at item.category = doc('meta[name=catname]').attr('content') item.author = doc('meta[name=author]').attr('content') content = util.get_paragraphs_from_selector(doc, 'div.article_content__module p') if content == '': content = util.get_paragraphs_from_selector(doc, 'li.article_summary_pt') item.content = content item.url = task.url item.source = 'HK01' item.task_no = self.BATCH_NUMBER for img in doc('.article__body__content img').items(): if img.attr('src') != '' and not img.parents('.related_article'): media_u = img.attr('src') if media_u != '//cdn.hk01.com/media/dummy/default_image.png': des = '' if img.attr('alt'): des = img.attr('alt') media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des, created_at=item.fetched_at) item.media_list.append(media) for img in doc('.article__body__content object[data-gallery-image="true"]').items(): if img.attr('data'): media_u = img.attr('data') des = '' if img.attr('alt'): des = img.attr('alt') media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des, created_at=item.fetched_at) item.media_list.append(media) for a in doc('iframe').items(): if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')): media_u = a.attr('src') if re.match(r'//.+', media_u): media_u = 'http:' + media_u media = self.NewsItem.MediaItem(media_url=media_u, type='youtube', description='youtube', created_at=item.fetched_at) item.media_list.append(media) if util.within_active_interval(6, 1200): _comments = util.get_filtered_facebook_comments_data('1651866545051541', 
doc('div.fb-comments').attr('data-href'), task.url) if _comments: for _comment in _comments: item.media_list.append( self.NewsItem.MediaItem(media_url=_comment['json_string'], type='comments', description='comments', created_at=item.fetched_at) ) '''
def normal_item_solver(self, item, task, response):
    # Parse a HeadlineNews article page into `item`.
    doc = self.get_doc(response)
    # Strip the " - ..." suffix from the <title>.
    title = util.get_filtered_title(doc, {'title'}, ur' - .*')
    t = ''
    t_stamp = 0
    # Not every page carries a timestamp node; leave t_stamp at 0 if absent.
    if doc('time') or doc('span.date'):
        t = util.get_time_string_from_selectors(doc, {'time', 'span.date'})
        # Add current seconds so refetches of the same page differ slightly.
        t_stamp = util.get_timestamp_from_string(t) + int(
            time.localtime().tm_sec)
    # Keep only the last whitespace-separated token of the active menu entry.
    category = re.sub(ur'.*\s+', u'',
                      doc('.dropdown-menu li.active a').text())
    author = ''
    content = util.get_paragraphs_from_selector(doc, '#news-content')
    if content == '':
        content = util.get_paragraphs_from_selector(doc, '.content span')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'HeadlineNews'
    item.task_no = self.BATCH_NUMBER
    # Article images; protocol-relative URLs get an explicit scheme.
    for img in doc('.content .item img').items():
        if img.attr('src') != '':
            media_u = img.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)
    # Embedded YouTube players.
    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
            media_u = a.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)
    # Facebook comments, only inside the active crawl window (note: this
    # spider uses a 12h/600s window, unlike the 6h/1200s used elsewhere).
    if util.within_active_interval(12, 600):
        _comments = util.get_filtered_facebook_comments_data(
            '978368502211772', doc('div.fb-comments').attr('data-href'),
            task.url)
        if _comments:
            for _comment in _comments:
                item.media_list.append(
                    self.NewsItem.MediaItem(
                        media_url=_comment['json_string'],
                        type='comments',
                        description='comments',
                        created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Populate `item` from a NowNews article page (no media extraction)."""
    doc = self.get_doc(response)
    published = doc('time.published')
    # Category: active nav-bar entry first, URL pattern as fallback.
    category = doc('#navBar li.active').text()
    if category == '' and cat_pattern_in_url.findall(task.url):
        category = cat_pattern_in_url.findall(task.url)[0]
    # Article body: paragraph children first, whole container as fallback.
    body_text = util.get_paragraphs_from_selector(doc, 'div.newsLeading p')
    if body_text == '':
        body_text = util.get_paragraphs_from_selector(doc, 'div.newsLeading')
    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = published.text()
    item.t_stamp = util.get_timestamp_from_string(
        published.attr('datetime'), '%Y-%m-%d %H:%M:%S+0800')
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = ''
    item.content = body_text
    item.url = task.url
    item.source = 'NowNews'
    item.task_no = self.BATCH_NUMBER
    # Facebook comments are only fetched inside the active crawl window.
    if util.within_active_interval(6, 1200):
        comments = util.get_filtered_facebook_comments_data(
            '515076798590105', doc('div.fb-comments').attr('data-href'),
            task.url)
        for comment in comments or []:
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=comment['json_string'],
                type='comments',
                description='comments',
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Populate `item` from a CableNews video page (text fields and FB comments only)."""
    doc = self.get_doc(response)
    time_text = util.get_time_string_from_selectors(
        doc, {'div.video_date span'}, date_patterns={date_pattern})
    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'title'})
    item.t = time_text
    # Add the current seconds so refetches of the same page differ slightly.
    item.t_stamp = util.get_timestamp_from_string(time_text) + int(
        time.localtime().tm_sec)
    item.fetched_at = task.fetched_at
    item.category = doc('span.heretxt').text()
    item.author = ''
    item.content = util.get_paragraphs_from_selector(doc,
                                                     'div.video_content')
    item.url = task.url
    item.source = 'CableNews'
    item.task_no = self.BATCH_NUMBER
    # Facebook comments are only fetched inside the active crawl window.
    if util.within_active_interval(6, 1200):
        comments = util.get_filtered_facebook_comments_data(
            '482092838576644', doc('div.fb-comments').attr('data-href'),
            task.url)
        for comment in comments or []:
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=comment['json_string'],
                type='comments',
                description='comments',
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Populate `item` from an AppleNews article page."""
    # Force UTF-8 before parsing the response body.
    response.encoding = 'utf-8'
    doc = self.get_doc(response)
    day = util.get_day_string(offset=self.OFFSET)
    now = time.localtime()
    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'title'}, u' \|.*')
    item.t = day
    # Day-resolution date plus the wall-clock hour/minute of the crawl.
    item.t_stamp = (util.get_timestamp_from_string(day)
                    + now.tm_hour * 3600 + now.tm_min * 60)
    item.fetched_at = task.fetched_at
    item.category = doc('meta[name=subsection]').attr('content')
    item.author = ''
    item.content = util.get_paragraphs_from_selector(doc,
                                                     '#masterContent p')
    item.url = task.url
    item.source = 'AppleNews'
    item.task_no = self.BATCH_NUMBER
    # Facebook comments are only fetched inside the active crawl window;
    # this spider keys comments off the og:url meta tag.
    if util.within_active_interval(6, 20 * 60):
        comments = util.get_filtered_facebook_comments_data(
            '367495573302576',
            doc('meta[property="og:url"]').attr('content'), task.url)
        for comment in comments or []:
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=comment['json_string'],
                type='comments',
                description='comments',
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    # Parse a SkyPost article page into `item`.
    doc = self.get_doc(response)
    # Strip the "| ..." suffix from the <title>.
    title = util.get_filtered_title(doc, {'title'}, ur'\s*\|.*')
    t = util.get_time_string_from_selectors(
        doc, {'div.PolDTextBox_Date', 'div.MainNews_Date'})
    # Reorders the date string into 'YYYYMMDD' by fixed slicing — assumes a
    # 'DD/MM/YYYY'-shaped source string; TODO confirm against live pages.
    t = t[-4:] + t[3:5] + t[:2]
    # Day-resolution date plus the wall clock of the crawl.
    t_stamp = util.get_timestamp_from_string(
        t) + time.localtime().tm_hour * 3600 + time.localtime(
        ).tm_min * 60 + time.localtime().tm_sec
    # Category is a percent-encoded path segment of the article URL.
    category = ''
    if cat_pattern.findall(task.url):
        category = cat_pattern.findall(task.url)[0]
        category = urllib2.unquote(category)
    author = ''
    content = util.get_paragraphs_from_selector(doc, 'div.PolDTextBox div')
    if content == '':
        content = util.get_paragraphs_from_selector(
            doc, 'div.MainNews_Text div')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'SkyPost'
    item.task_no = self.BATCH_NUMBER
    # Body images; caption from a sibling .PolDCaption, falling back to the
    # enclosing .NewsDPicBox02 container's text.
    for img in doc('.PolDetailBox img').items():
        if img.attr('src') != '':
            media_u = img.attr('src')
            des = ''
            if img.parent().siblings('.PolDCaption'):
                des = img.parent().siblings('.PolDCaption').text()
            elif img.parents('.NewsDPicBox02'):
                des = img.parents('.NewsDPicBox02').text()
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)
    # Embedded YouTube players; add a scheme to protocol-relative URLs.
    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
            media_u = a.attr('src')
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)
    # Facebook comments are only fetched inside the active crawl window.
    if util.within_active_interval(6, 1200):
        _comments = util.get_filtered_facebook_comments_data(
            '335749279848103', doc('div.fb-comments').attr('data-href'),
            task.url)
        if _comments:
            for _comment in _comments:
                item.media_list.append(
                    self.NewsItem.MediaItem(
                        media_url=_comment['json_string'],
                        type='comments',
                        description='comments',
                        created_at=item.fetched_at))
doc, '#articleContent p') item.raw = doc.text() item.title = title item.t = t item.t_stamp = t_stamp item.fetched_at = task.fetched_at item.category = category item.author = author item.content = content item.url = task.url item.source = 'BastillePost' item.task_no = self.BATCH_NUMBER if util.within_active_interval(6, 1200): _comments = util.get_filtered_facebook_comments_data( '617988554913649', doc('#fbComments').attr('href'), task.url) if _comments: for _comment in _comments: item.media_list.append( self.NewsItem.MediaItem( media_url=_comment['json_string'], type='comments', description='comments', created_at=item.fetched_at)) @classmethod def get_auto_configured_spider(cls, offset=0): bastille_seed = {'http://www.bastillepost.com/hongkong/'} r = requests.get('http://www.bastillepost.com/hongkong/') d = pq(r.text)