def normal_item_solver(self, item, task, response):
    """Fill *item* from a CommercialRadio (881903.com) article response."""
    doc = self.get_doc(response)

    title = util.get_filtered_title(doc, {'title'}, u'881903.com 商業電台 - ')

    t = util.get_time_string_from_selectors(
        doc, {'#divnewsTextDate', '#part6808_ctl00_lblDetailDate'})
    # Add the current wall-clock seconds so repeated fetches of the same
    # minute-resolution timestamp stay distinguishable.
    t_stamp = util.get_timestamp_from_string(t, '%d.%m.%Y %H:%M') + int(
        time.localtime().tm_sec)

    # The site has used several content containers over time; try each
    # selector in turn until one yields text.
    content = ''
    for selector in ('#divnewsTextContent p', '#tdContent p',
                     '.newsTextContent2'):
        content = util.get_paragraphs_from_selector(doc, selector)
        if content != '':
            break

    item.raw = doc.text()
    item.fetched_at = task.fetched_at
    item.url = task.url
    item.source = 'CommercialRadio'
    item.task_no = self.BATCH_NUMBER
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.category = doc('#part8425_ctl00_divtitle').text()
    item.author = ''
    item.content = content
def normal_item_solver(self, item, task, response):
    """Fill *item* from an AM730 article response."""
    doc = self.get_doc(response)
    time_text = util.get_time_string_from_selectors(doc,
                                                    {'div.dateforarticle'})

    # The page shows either a relative time ("x hours ago" style) or an
    # absolute date; anything else leaves the stamp at 0.
    stamp = 0
    if relative_time_pattern.match(time_text):
        stamp = self._get_timestamp_from_relative_time_str(time_text)
    elif absolute_time_pattern.match(time_text):
        # Seconds keep same-minute stamps distinct.
        stamp = util.get_timestamp_from_string(time_text) + int(
            time.localtime().tm_sec)

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h2'})
    item.t = time_text
    item.t_stamp = stamp
    item.fetched_at = task.fetched_at
    item.category = '新聞'
    item.author = ''
    item.content = util.get_paragraphs_from_selector(doc, '#mymain')
    item.url = task.url
    item.source = 'AM730'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Fill *item* from a NewCenturyForum article response.

    The publish time string (date-only) is looked up from
    ``self.url_time_dict`` — populated by the list-page parser — under
    its lock, and the current time-of-day is added to the stamp.

    Fixes: removed a leftover debug ``print t`` statement, and
    ``time.localtime()`` is now sampled once so hour/minute/second come
    from the same instant (three separate calls could straddle a
    minute/hour boundary).
    """
    doc = self.get_doc(response)
    title = util.get_filtered_title(doc, {'.article-title'})

    with self.url_time_dict_lock:
        t = self.url_time_dict[task.url]

    now = time.localtime()
    t_stamp = (util.get_timestamp_from_string(t)
               + now.tm_hour * 3600 + now.tm_min * 60 + now.tm_sec)

    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = '新聞發佈'
    item.author = ''
    item.content = util.get_paragraphs_from_selector(doc, '.article-content')
    item.url = task.url
    item.source = 'NewCenturyForum'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Fill *item* from a JD Online article response."""
    response.encoding = 'utf-8'
    doc = self.get_doc(response)

    # The .date element may carry extra text; keep only the first
    # pattern match when one exists.
    raw_date = doc('.date').text()
    matches = date_pattern.findall(raw_date)
    if matches:
        raw_date = matches[0]

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'.article-header h1'})
    item.t = raw_date
    item.t_stamp = util.get_timestamp_from_string(raw_date)
    item.fetched_at = task.fetched_at
    item.category = doc('.now-here').text()
    item.author = ''
    item.content = util.get_paragraphs_from_selector(doc, '.article p')
    item.url = task.url
    item.source = 'JD Online'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response): doc = self.get_doc(response) title = util.get_filtered_title(doc, {'h1.entry-title'}) t = doc('meta[property="article:published_time"]').attr('content') t_stamp = 0 if t: t_stamp = util.get_timestamp_from_string(t) category = '' cat_find_res = cat_pattern.findall(task.url) if cat_find_res: category = cat_find_res[0] category = u'科技/' + category author = '' content = util.get_paragraphs_from_selector(doc, '#content div.entry p:not(.meta)') content = re.sub(ur'(來源:|來源:|Tags:).+', u'', content) item.raw = doc.text() item.title = title item.t = t item.t_stamp = t_stamp item.fetched_at = task.fetched_at item.category = category item.author = author item.content = content item.url = task.url item.source = 'Unwire' item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response): doc = self.get_doc(response) title = util.get_filtered_title(doc, {'h1'}) t = doc('meta[property="article:published_time"]').attr('content') t_stamp = 0 if t: t_stamp = util.get_timestamp_from_string(t) category = '' author = '' content = util.get_paragraphs_from_selector(doc, '#main-content .entry-content p') content = re.sub(ur'繼續閱讀[\n\s\S.]*', '', content) item.raw = doc.text() item.title = title item.t = t item.t_stamp = t_stamp item.fetched_at = task.fetched_at item.category = category item.author = author item.content = content item.url = task.url item.source = 'UBeat' item.task_no = self.BATCH_NUMBER for img in doc('figure.entry-thumbnail img').items(): if img.attr('src') != '': media_u = img.attr('src') des = '' if img.attr('alt'): des = img.attr('alt') media = self.NewsItem.MediaItem(media_url=media_u, type='image', description=des, created_at=item.fetched_at) item.media_list.append(media)
def normal_item_solver(self, item, task, response):
    """Fill *item* from an HKSilicon article response.

    The publish stamp is preferably recovered from a numeric token in
    the og:image URL; otherwise it is derived from the relative time
    shown next to the calendar icon.
    """
    response.encoding = 'utf-8'
    doc = self.get_doc(response)

    stamp = 0
    og_image = doc('meta[property="og:image:url"]').attr('content')
    if og_image:
        tokens = url_stamp_pattern.findall(og_image)
        if tokens:
            stamp = int(tokens[0])

    if stamp:
        time_text = time.ctime(stamp)
    else:
        time_text = doc('ul.blog-info i.fa-calendar').parent('li').text()
        stamp = self.get_stamp_from_relative_timestr(time_text)

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = time_text
    item.t_stamp = stamp
    item.fetched_at = task.fetched_at
    item.category = doc('ul.blog-info i.fa-tags').siblings('a').text()
    item.author = ''
    item.content = util.get_paragraphs_from_selector(doc, 'div.blog-content')
    item.url = task.url
    item.source = 'HKSilicon'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Fill *item* from a Legislative Council webcast page.

    The article body is not scraped from the page itself: the meeting id
    is taken from the URL and the agenda time-markers are fetched live
    from the webcast JSON service, then joined into "time - item" lines.
    """
    doc = self.get_doc(response)
    title = util.get_filtered_title(doc, {'.PlaylistRow kanhanpass'})
    t = util.get_time_string_from_selectors(doc, {'.PlaylistRow td'},
                                            date_patterns={date_pattern})
    # Seconds are added so stamps from the same minute stay distinct.
    t_stamp = util.get_timestamp_from_string(t, '%Y/%m/%d %I:%M %p') + int(
        time.localtime().tm_sec)
    category = doc('#topMenu a.on').text()
    author = ''
    # Meeting id embedded in the task URL; assumed to always be present
    # (IndexError otherwise) -- NOTE(review): confirm upstream filtering.
    c_id = re.findall(url_id_pattern, task.url)[0]
    r = requests.get(
        'http://webcast.legco.gov.hk/Public_uat_embedded/Service1.asmx/GetTimeMarker?meetingID='
        + c_id + '&lang=tc')
    j_obj = json.loads(r.text)
    content = ''
    for agenda in j_obj['TimeMarkerItems']:
        a_time = agenda['AgendaTime']
        a_item = agenda['AgendaItem']
        content = content + a_time + u' - ' + a_item + u'\n'
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'LegislativeCouncil'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response): doc = self.get_doc(response) title = util.get_filtered_title(doc, {'h1 a'}) t = util.get_time_string_from_selectors(doc, {'span.postdate'}) t_stamp = util.get_timestamp_from_string(t) + int( time.localtime().tm_sec) category = doc('span.postcat a').text() author = doc('span.postauthor a').text() content = util.get_paragraphs_from_selector(doc, 'div p') content = re.sub(ur'投稿:[.\n\r\t]*.*', u'', content, re.M | re.U | re.I) content = re.sub(ur'則留言[.\n\r\t]*', u'', content, re.M | re.U | re.I) content = re.sub(ur'大道之行也,天下為公,選賢與能,講信修睦。---《禮運.大同》[.\n\r\t]*', u'', content, re.M | re.U | re.I) item.raw = doc.text() item.title = title item.t = t item.t_stamp = t_stamp item.fetched_at = task.fetched_at item.category = category item.author = author item.content = content item.url = task.url item.source = 'VJMedia' item.task_no = self.BATCH_NUMBER for img in doc( '#container img.size-full, #container img.size-large').items(): if img.attr('src') != '': des = '' if img.attr('alt'): des = img.attr('alt') elif img.siblings('p'): des = img.siblings('p').text() media = self.NewsItem.MediaItem(media_url=img.attr('src'), type='image', description=des, created_at=item.fetched_at) item.media_list.append(media) for iframe in doc('iframe').items(): if iframe.attr('src') and re.match(r'.*youtube\.com.+', iframe.attr('src')): media = self.NewsItem.MediaItem(media_url=iframe.attr('src'), type='youtube', description='youtube', created_at=item.fetched_at) item.media_list.append(media) if util.within_active_interval(6, 1200): _comments = util.get_filtered_facebook_comments_data( '214585295294555', doc('div.fb-comments').attr('data-href'), task.url) if _comments: for _comment in _comments: item.media_list.append( self.NewsItem.MediaItem( media_url=_comment['json_string'], type='comments', description='comments', created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Fill *item* from a ChinaDaily (Asia) article, collecting images."""
    doc = self.get_doc(response)

    body = util.get_paragraphs_from_selector(doc, 'div.contentbox p')
    # Remove cross-promotion "READMORE" lines.
    body = re.sub(r'READMORE\: .+\n', '', body)

    time_text = util.get_time_string_from_selectors(doc, {'div.pubtime'})

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1.conttit'})
    item.t = time_text
    item.t_stamp = util.get_timestamp_from_string(time_text)
    item.fetched_at = task.fetched_at
    item.category = 'hk'
    item.author = ''
    item.content = body
    item.url = task.url
    item.source = 'ChinaDaily'
    item.task_no = self.BATCH_NUMBER

    for img in doc('div.contentbox img').items():
        if img.attr('src') != '':
            # Image srcs are rebuilt site-relative from "attachement"
            # onwards.
            media_u = 'http://www.chinadailyasia.com/' + re.sub(
                r'.+(?=attachement)', '', img.attr('src'))
            des = img.attr('alt') or ''
            if not des and img.siblings('p'):
                des = img.siblings('p').text()
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=media_u, type='image', description=des,
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Fill *item* from an HKGolden blog post."""
    doc = self.get_doc(response)
    title = util.get_filtered_title(doc, {'h1'})
    # The post date is an undistinguished sibling <div>; it is located
    # by its grey text colour.  NOTE(review): relies on pyquery .css()
    # returning the style value exactly as 'rgb(128, 128, 128)' --
    # confirm against the live markup.
    t_divs = doc('h1').siblings('div').items()
    t = ''
    t_stamp = 0
    for _div in t_divs:
        if _div.css('color') == 'rgb(128, 128, 128)':
            t = _div.text()
            # Seconds keep same-minute stamps distinct.
            t_stamp = util.get_timestamp_from_string(t) + int(
                time.localtime().tm_sec)
            break
    category = u'电子'
    author = ''
    content = util.get_paragraphs_from_selector(doc, '.href_txt_blog2')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'HKGolden'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Partially fill *item*: raw text, title, time string and an
    AM-only timestamp.

    NOTE(review): only times containing 上午 (AM) are converted to a
    stamp; 下午 (PM) times leave t_stamp at 0.  Fields such as category,
    content and source are not set here -- confirm whether the rest of
    the solver lives outside this chunk or the block is unfinished.
    """
    doc = self.get_doc(response)
    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = util.get_time_string_from_selectors(doc, {'h6'})
    item.t_stamp = 0
    if re.findall(ur'上午', item.t):
        # Translate the Chinese AM marker so %p can parse it.
        item.t_stamp = util.get_timestamp_from_string(
            re.sub(ur'上午', u'AM', item.t), u'%Y年%m月%d日 %p%I:%M')
def normal_item_solver(self, item, task, response):
    """Fill *item* from a HouseNewsBlogger post, collecting images and
    YouTube embeds."""
    doc = self.get_doc(response)

    published = doc('meta[property="article:published_time"]').attr('content')
    stamp = util.get_timestamp_from_string(published) if published else 0

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = published
    item.t_stamp = stamp
    item.fetched_at = task.fetched_at
    item.category = doc('.post-lead-category').text()
    item.author = doc('.author.vcard').text()
    item.content = util.get_paragraphs_from_selector(doc, 'section.entry p')
    item.url = task.url
    item.source = 'HouseNewsBlogger'
    item.task_no = self.BATCH_NUMBER

    for img in doc('section.entry img').items():
        if img.attr('src') != '':
            des = img.attr('alt') or ''
            if not des and img.siblings('p'):
                des = img.siblings('p').text()
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=img.attr('src'), type='image', description=des,
                created_at=item.fetched_at))

    for frame in doc('iframe').items():
        src = frame.attr('src')
        if src and re.match(r'.*youtube\.com.+', src):
            # Protocol-relative URLs are normalised to http.
            if re.match(r'//.+', src):
                src = 'http:' + src
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=src, type='youtube', description='youtube',
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Extract title, publish time and category for a China Review News
    (中國評論新聞) article.

    NOTE(review): this block only computes locals -- no ``item`` fields
    are assigned; confirm whether the assignments live outside this
    chunk.
    """
    doc = self.get_doc(response)
    title = util.get_filtered_title(doc, {'title'}, u'中國評論新聞:')
    t = util.get_time_string_from_selectors(doc, {'td'}, {date_pattern})
    t_stamp = util.get_timestamp_from_string(t, '%Y-%m-%d %H:%M:%S')
    category = ''
    # The category name is not in the page body; it is recovered from an
    # <a> tag inside inline scripts that references the same column id
    # (coluid) as the article URL.  The last match wins; raises
    # IndexError when the scripts hold no such link.
    scripts = doc('script').text()
    if re.findall(ur'coluid=\d+', task.url):
        col_id_str = re.findall(ur'coluid=\d+', task.url)[0]
        cat_block = re.findall(r'<a[^<>]*?' + col_id_str + '.+?</a>',
                               scripts)[-1]
        cat_doc = pq(cat_block)
        category = cat_doc.text()
def normal_item_solver(self, item, task, response):
    """Fill *item* from a ZaoBao article, collecting picture sources and
    the numeric article id from the URL."""
    doc = self.get_doc(response)

    time_text = doc('span.datestamp').text()

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = time_text
    item.t_stamp = util.get_timestamp_from_string(time_text)
    item.fetched_at = task.fetched_at
    item.category = doc('#breadcrumbs a:last-child').text()
    item.author = doc('.contributor a').text()
    item.content = util.get_paragraphs_from_selector(
        doc, '.article-content-container p')
    item.url = task.url
    item.source = 'ZaoBao'
    item.task_no = self.BATCH_NUMBER

    # Lazy-loaded <picture> elements carry the real URL in data-srcset.
    for pic in doc('.loadme picture').items():
        if pic('source') and pic('source').attr('data-srcset'):
            des = ''
            if pic.attr('title'):
                des = pic.attr('title')
            elif pic('img') and pic('img').attr('title'):
                des = pic('img').attr('title')
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=pic('source:first-child').attr('data-srcset'),
                type='image', description=des,
                created_at=item.fetched_at))

    # Numeric id from the URL; '' when the pattern does not match.
    ids = news_id_pattern.findall(task.url)
    item.id = ids[0] if ids else ''
def normal_item_solver(self, item, task, response):
    """Fill *item* from a DMHK article response.

    Bug fix: ``task.feteched_at`` (typo) raised AttributeError on every
    call; corrected to ``task.fetched_at``, matching every sibling
    solver.
    """
    doc = self.get_doc(response)
    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = util.get_time_string_from_selectors(doc, {'div.post_time'})
    # No parseable stamp on the page; record the crawl time instead.
    item.t_stamp = util.get_now()
    item.fetched_at = task.fetched_at
    item.category = doc('div.post_cats a:last-child').text()
    item.author = doc('.single_author a').text()
    item.content = util.get_paragraphs_from_selector(doc,
                                                     'div.single_text p')
    item.url = task.url
    item.source = 'DMHK'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Fill *item* from a SunDaily article.

    SunDaily pages have many historical layouts, so both the category
    and the body text are resolved through a cascade of fallbacks, each
    tried only when the previous one produced nothing.
    """
    doc = self.get_doc(response)
    title = util.get_filtered_title(
        doc, {'h1', 'font.heading', 'font[size="+2"]'})
    # No per-article time on the page: use the crawl day (with the
    # configured day offset) as the publish time.
    t = util.get_day_string(offset=self.OFFSET)
    t_stamp = util.get_day_stamp(self.OFFSET)
    category = ''
    if cat_pattern.findall(task.url):
        cat_word = cat_pattern.findall(task.url)[0]
        category = doc('.' + cat_word).text()
    if category == '':
        # Fallback: first whitespace-delimited token of the td/font text.
        category = re.sub(ur' .*', u'', doc('td font').text())
    author = ''
    content = util.get_paragraphs_from_selector(doc, '.newsText p')
    if content == '':
        content = util.get_paragraphs_from_selector(doc, '#contentAD1 p')
    if content == '':
        # .remove() mutates the selection in place to strip ads and
        # captions before taking the text.
        _doc = doc('#contentAD1')
        _doc.remove('table')
        _doc.remove('span')
        content = util.get_paragraphs_from_selector(_doc, 'div')
    if content == '':
        content = util.get_paragraphs_from_selector(doc, 'dd')
    if content == '':
        content = doc('.caption').next_all('p').text()
    if content == '':
        # NOTE(review): doc.parent('.caption') filters the parent of the
        # document root -- this branch may never match; confirm intent.
        _doc = doc.parent('.caption').parent()
        _doc.remove('table').remove('span')
        content = _doc.text()
    if content == '':
        _doc = doc('.caption').parent()
        content = _doc.remove('table').text()
    if content == '':
        content = doc('.summaryPara').text()
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'SunDaily'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Fill *item* from a News Lens article, collecting images and
    YouTube embeds."""
    doc = self.get_doc(response)

    published = doc('meta[property="article:published_time"]').attr('content')

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'title'},
                                         u' - The News Lens.*')
    item.t = published
    # Published time is UTC; shift to HK time (UTC+8).
    item.t_stamp = util.get_timestamp_from_string(published) + 8 * 3600
    item.fetched_at = task.fetched_at
    item.category = doc('meta[property="article:section"]').attr('content')
    item.author = doc('meta[name="author"]').attr('content')
    item.content = util.get_paragraphs_from_selector(
        doc, 'div.article-body-container div.article-content p')
    item.url = task.url
    item.source = 'NewsLens'
    item.task_no = self.BATCH_NUMBER

    for img in doc(
            'div.article-header-container img.front-img, div.article-content img'
    ).items():
        if img and img.attr('src') != '':
            # The large variant lives in the custom src-lg attribute.
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=img.attr('src-lg'), type='image',
                description=img.attr('alt') or '',
                created_at=item.fetched_at))

    for frame in doc('iframe').items():
        src = frame.attr('src')
        if src and re.match(r'.*youtube\.com.+', src):
            # Protocol-relative URLs are normalised to http.
            if re.match(r'//.+', src):
                src = 'http:' + src
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=src, type='youtube', description='youtube',
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Fill *item* from a WSJ (Chinese) article, collecting images and
    YouTube embeds.

    Cleanup: removed the unused local ``cat = doc('body').attr('class')``
    left over from an earlier category heuristic.
    """
    doc = self.get_doc(response)
    title = util.get_filtered_title(doc, {'h1'})
    t = util.get_time_string_from_selectors(doc, {'time.timestamp'})
    t_stamp = util.get_timestamp_from_string(t)
    category = ''
    if doc('.article-breadCrumb a'):
        category = doc('.article-breadCrumb a').text()
    author = doc('.author span').text()
    content = util.get_paragraphs_from_selector(doc, '#A p')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'WSJ'
    item.task_no = self.BATCH_NUMBER
    for img in doc('#A .media-object-image img').items():
        if img.attr('src') != '':
            media_u = img.attr('src')
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='image', description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)
    for a in doc('iframe').items():
        if a.attr('src') and re.match(r'.*youtube\.com.+', a.attr('src')):
            media_u = a.attr('src')
            # Protocol-relative URLs are normalised to http.
            if re.match(r'//.+', media_u):
                media_u = 'http:' + media_u
            media = self.NewsItem.MediaItem(media_url=media_u,
                                            type='youtube',
                                            description='youtube',
                                            created_at=item.fetched_at)
            item.media_list.append(media)
def normal_item_solver(self, item, task, response):
    """Fill *item* from an SCMP article, collecting gallery images and
    lazily-loaded YouTube embeds."""
    doc = self.get_doc(response)

    created = doc('div[itemprop="dateCreated"]').attr('datetime')

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = created
    # Stamp is UTC; shift to HK time (UTC+8).
    item.t_stamp = util.get_timestamp_from_string(created) + 8 * 3600
    item.fetched_at = task.fetched_at
    item.category = doc('.pane-content .lineage-item:last-child').text()
    item.author = doc('.scmp-v2-author-name').text()
    item.content = util.get_paragraphs_from_selector(doc, '.pane-content p')
    item.url = task.url
    item.source = 'SCMP'
    item.task_no = self.BATCH_NUMBER

    # Gallery images keep their real URL in data-original (lazy-load).
    for img in doc('.scmp-gallery-swiper img').items():
        if img.attr('data-original') != '':
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=img.attr('data-original'), type='image',
                description=img.attr('data-caption') or '',
                created_at=item.fetched_at))

    # YouTube iframes are lazy-loaded the same way.
    for frame in doc('iframe').items():
        lazy_src = frame.attr('data-original')
        if lazy_src and re.match(r'.*youtube\.com.+', lazy_src):
            des = ''
            if frame.children('title'):
                des = frame.children('title').text()
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=lazy_src, type='youtube', description=des,
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Fill *item* from a Hong Kong Free Press article, collecting
    in-body images."""
    doc = self.get_doc(response)

    published = doc('meta[property="article:published_time"]').attr('content')
    if published:
        # Published time is UTC; shift to HK time (UTC+8).
        stamp = util.get_timestamp_from_string(published) + 8 * 3600
    else:
        stamp = 0

    # Join all category links as "a, b, c".
    names = [c.text()
             for c in doc('#main .entry-header .meta-category a').items()]
    category = ', '.join(names)

    body = util.get_paragraphs_from_selector(
        doc, 'div.entry-content p:not(.wp-caption-text)')
    # Drop the trailing "Comments" widget text.
    body = re.sub(r'Comments$', '', body)

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = published
    item.t_stamp = stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = doc('#main .meta-item.author').text()
    item.content = body
    item.url = task.url
    item.source = 'HKFP'
    item.task_no = self.BATCH_NUMBER

    for img in doc('div.entry-content img').items():
        if img.attr('src') != '':
            des = img.attr('alt') or ''
            if not des and img.siblings('p'):
                des = img.siblings('p').text()
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=img.attr('src'), type='image', description=des,
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Fill *item* from a BBC Chinese article.

    The publish time is looked for in three places (data-datetime
    attribute, timeline status header, bolded date) in order of
    preference; when none matches the stamp stays 0.
    """
    doc = self.get_doc(response)

    t = ''
    t_stamp = 0
    date_attr = doc('.story-body .mini-info-list .date').attr('data-datetime')
    if date_attr:
        t = date_attr
    else:
        for sel in ('.timeline-status h3', '.story-body .date strong'):
            node = doc(sel)
            if node and date_pattern.findall(node.text()):
                t = date_pattern.findall(node.text())[0]
                break
    if t:
        # Seconds keep same-minute stamps distinct.
        t_stamp = util.get_timestamp_from_string(t) + time.localtime().tm_sec

    # The article body has three historical layouts; try each in turn.
    content = ''
    for sel in ('div[property="articleBody"] p', '.article-wrapper p',
                '.map-body p'):
        content = util.get_paragraphs_from_selector(doc, sel)
        if content != '':
            break

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'title'},
                                         u' - BBC 中文网| BBC Zhongwen')
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = doc('meta[property="article:section"]').attr('content')
    item.author = doc('span.byline__name').text()
    item.content = content
    item.url = task.url
    item.source = 'BBC Chinese'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Fill *item* from a The Standard article, collecting in-body
    images."""
    doc = self.get_doc(response)
    title = util.get_filtered_title(doc, {'h1'}, r' The Standard$')
    # The heading line reads "<category> | <date>"; strip the <span>
    # first (mutates the selection in place), then split on ' | '.
    # NOTE(review): raises IndexError when the separator is missing --
    # confirm every article page carries it.
    pl = doc('.heading .pull-left')
    pl.remove('span')
    t = pl.text().split(' | ')[1]
    if t:
        t_stamp = util.get_timestamp_from_string(t)
    else:
        t_stamp = 0
    category = pl.text().split(' | ')[0]
    author = ''
    content = util.get_paragraphs_from_selector(doc, 'div.content p')
    if content == '':
        # Older layout: body text sits directly in div.content.
        content = util.get_paragraphs_from_selector(doc, 'div.content')
    item.raw = doc.text()
    item.title = title
    item.t = t
    item.t_stamp = t_stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = author
    item.content = content
    item.url = task.url
    item.source = 'Standard'
    item.task_no = self.BATCH_NUMBER
    for img in doc('div.content img').items():
        if img.attr('src') != '':
            media_u = img.attr('src')
            des = ''
            if img.attr('alt'):
                des = img.attr('alt')
            elif img.siblings('p'):
                des = img.siblings('p').text()
            media = self.NewsItem.MediaItem(media_url=media_u, type='image',
                                            description=des,
                                            created_at=item.fetched_at)
            item.media_list.append(media)
def normal_item_solver(self, item, task, response): doc = self.get_doc(response) item.raw = doc.text() item.title = util.get_filtered_title(doc, {'title'}, u'東周網【東周刊官方網站】| - .+') item.t = util.get_time_string_from_selectors( doc, {'span.pull-right'}, date_patterns={date_pattern}) item.t_stamp = util.get_timestamp_from_string( item.t, time_format=u'%Y 年 %m 月 %d 日') + time.localtime( ).tm_hour * 3600 + time.localtime().tm_min * 60 + time.localtime( ).tm_sec item.fetched_at = task.fetched_at item.category = re.sub(ur'.*\s+', u'', doc('.default-group a').text()) item.author = '' item.content = util.get_paragraphs_from_selector( doc, 'div.view-content p') item.url = task.url item.source = 'EastWeek' item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Fill *item* from a TMHK article, collecting in-body images."""
    doc = self.get_doc(response)

    published = doc('meta[property="article:published_time"]').attr('content')
    if published:
        stamp = util.get_timestamp_from_string(published)
    else:
        stamp = 0

    # Overlong author strings are scraped noise, not names.
    author = doc('.single-meta span.author.vcard').text()
    if len(author) > 20:
        author = ''

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = published
    item.t_stamp = stamp
    item.fetched_at = task.fetched_at
    item.category = doc('.single-meta div:nth-child(3) a').text()
    item.author = author
    item.content = util.get_paragraphs_from_selector(doc, 'div.resize p')
    item.url = task.url
    item.source = 'TMHK'
    item.task_no = self.BATCH_NUMBER

    for img in doc('div.resize img').items():
        if img.attr('src') != '':
            des = img.attr('alt') or ''
            if not des and img.siblings('p'):
                des = img.siblings('p').text()
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=img.attr('src'), type='image', description=des,
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Fill *item* from a CNBC article, collecting in-body images."""
    doc = self.get_doc(response)

    published = doc('meta[property="article:published_time"]').attr('content')
    if published:
        # Published time is UTC; shift to HK time (UTC+8).
        stamp = util.get_timestamp_from_string(published) + 8 * 3600
    else:
        stamp = 0

    body = util.get_paragraphs_from_selector(doc, '#article_body p')
    # Drop the trailing social-media promo line.
    body = re.sub(r'Follow CNBC International on.+', '', body)

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = published
    item.t_stamp = stamp
    item.fetched_at = task.fetched_at
    item.category = doc('#pageHeadNav li.selected').text()
    item.author = doc('.story-top a[rel="author"]').text()
    item.content = body
    item.url = task.url
    item.source = 'CNBC'
    item.task_no = self.BATCH_NUMBER

    for img in doc('#article_body img').items():
        if img.attr('src') != '':
            des = img.attr('alt') or ''
            if not des and img.siblings('p'):
                des = img.siblings('p').text()
            item.media_list.append(self.NewsItem.MediaItem(
                media_url=img.attr('src'), type='image', description=des,
                created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Fill *item* from a Reuters China article response."""
    doc = self.get_doc(response)
    published = doc(
        'meta[property="og:article:published_time"]').attr('content')

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = published
    # Published time is UTC; shift to HK time (UTC+8).
    item.t_stamp = util.get_timestamp_from_string(published) + 8 * 3600
    item.fetched_at = task.fetched_at
    item.category = doc('.article-section').text()
    item.author = ''
    item.content = util.get_paragraphs_from_selector(doc, '#article-text p')
    item.url = task.url
    item.source = 'Reuters CN'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Fill *item* from a NowNews article; during active hours also
    pulls the page's Facebook comments."""
    doc = self.get_doc(response)

    stamp = util.get_timestamp_from_string(
        doc('time.published').attr('datetime'), '%Y-%m-%d %H:%M:%S+0800')

    category = doc('#navBar li.active').text()
    if category == '' and cat_pattern_in_url.findall(task.url):
        # Fall back to the category token embedded in the URL.
        category = cat_pattern_in_url.findall(task.url)[0]

    body = util.get_paragraphs_from_selector(doc, 'div.newsLeading p')
    if body == '':
        body = util.get_paragraphs_from_selector(doc, 'div.newsLeading')

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'h1'})
    item.t = doc('time.published').text()
    item.t_stamp = stamp
    item.fetched_at = task.fetched_at
    item.category = category
    item.author = ''
    item.content = body
    item.url = task.url
    item.source = 'NowNews'
    item.task_no = self.BATCH_NUMBER

    # Facebook comments are polled only inside the active window to
    # limit API traffic.
    if util.within_active_interval(6, 1200):
        _comments = util.get_filtered_facebook_comments_data(
            '515076798590105', doc('div.fb-comments').attr('data-href'),
            task.url)
        if _comments:
            for _comment in _comments:
                item.media_list.append(self.NewsItem.MediaItem(
                    media_url=_comment['json_string'], type='comments',
                    description='comments', created_at=item.fetched_at))
def normal_item_solver(self, item, task, response):
    """Fill *item* from a SpeakOut (港人講地) article; the stamp comes
    from the list-page cache ``self.url_time_dict`` under its lock."""
    doc = self.get_doc(response)

    with self.url_time_dict_lock:
        stamp = self.url_time_dict[task.url]

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'title'},
                                         u'港人講地 SPEAKOUT.HK - ')
    item.t = doc('.published').text()
    item.t_stamp = stamp
    item.fetched_at = task.fetched_at
    item.category = doc('.category-name').text()
    item.author = ''
    item.content = util.get_paragraphs_from_selector(doc, 'td p')
    item.url = task.url
    item.source = 'SpeakOut'
    item.task_no = self.BATCH_NUMBER
def normal_item_solver(self, item, task, response):
    """Fill *item* from an HKCNA (香港新聞網) article response."""
    response.encoding = 'utf-8'
    doc = self.get_doc(response)

    time_text = util.get_time_string_from_selectors(doc, {'.tm'},
                                                    date_patterns)

    item.raw = doc.text()
    item.title = util.get_filtered_title(doc, {'title'})
    item.t = time_text
    item.t_stamp = util.get_timestamp_from_string(time_text)
    item.fetched_at = task.fetched_at
    # The <h2><b> text carries a fixed 2-character suffix that is
    # trimmed off to leave the bare category name.
    item.category = doc('h2 b').text()[:-2]
    item.author = doc('.fdr span').text()
    item.content = util.get_paragraphs_from_selector(doc, '.wz_nr p')
    item.url = task.url
    item.source = 'HKCNA'
    item.task_no = self.BATCH_NUMBER