def parse_post(self, response):
    """Parse an arabia.io post page.

    Yields one ArabiaPostItem for the post itself, then one
    ArabiaCommentItem per comment, with each comment's parent_id
    resolved from the comment nesting depth.

    :param response: scrapy Response for a single post page.
    """
    post = ItemLoader(item=ArabiaPostItem(), response=response)
    post.default_output_processor = TakeFirst()
    # The short-url field value (https://arabia.io/go/<id>) is the most
    # reliable place to read the numeric post id from.
    post.add_xpath('id', '//*[@class="short_url inputtext"]/@value',
                   MapCompose(int), re=r'(\d+)')
    post.add_xpath('title', '//*[@id="nav_title"]/a/text()')
    post.add_xpath('up_votes', '//*[@class="s_upvotes"]/text()',
                   MapCompose(int), re=r'(\d+)')
    post.add_xpath('down_votes', '//*[@class="s_downvotes"]/text()',
                   MapCompose(int), re=r'(\d+)')
    post.add_xpath('points', '//*[@class="post_points ltr"]/text()',
                   MapCompose(int))
    post.add_xpath('author_username', '//*[@class="block username"]/text()')
    # Drop the non-breaking space padding around the full name.
    post.add_xpath('author_fullname', '//*[@class="block full_name"]/text()',
                   MapCompose(lambda value: value.replace(u'\xa0', u'')))
    post.add_xpath('date', '//*[@class="icon-time"]/../text()')
    post.add_xpath('community', '//*[@class="icon-reorder"]/../a[1]/text()')
    # BUG FIX: was MapCompose(string.strip); string.strip was removed in
    # Python 3, so this raised AttributeError there. A plain lambda strips
    # each extracted value on both Python 2 and 3.
    post.add_xpath('topics', '//*[@class="topic"]/text()',
                   MapCompose(lambda value: value.strip()))
    post.add_xpath('url', '//*[@class="short_url inputtext"]/@value')
    # Link posts carry rel="nofollow" on the title anchor; text posts don't.
    post.add_value(
        'type',
        'link' if post.get_xpath('//*[@id="nav_title"]/a/@rel',
                                 TakeFirst()) == 'nofollow' else 'text')
    if post.get_output_value('type') == 'link':
        post.add_xpath('link', '//*[@id="nav_title"]/a/@href')
        post.add_xpath('domain', '//*[@class="post_domain"]/text()',
                       re=r'\((.+?)\)')
    post.add_xpath('content', '//*[@class="post_content replace_urls"]/*',
                   Join('\n'))
    post.add_value('item', 'post')
    yield post.load_item()

    comments = []
    for row in response.selector.xpath(
            '//*[contains(@class, "post_comment")]'):
        comment = ItemLoader(item=ArabiaCommentItem(), selector=row,
                             response=response)
        comment.default_output_processor = TakeFirst()
        comment.add_xpath('id', './@id', re=r'(\d+)')
        # The "indexN" CSS class encodes the comment's nesting depth.
        comment.add_xpath('index', './@class', MapCompose(int),
                          re=r'index(\d+)')
        comment.add_value('post_id', post.get_output_value('id'))
        comment.add_xpath('author_username',
                          './/*[@class="comment_user"]/a/text()')
        comment.add_xpath('date', './/*[@class="comment_date"]/text()')
        comment.add_xpath('points',
                          './/*[@class="comment_points ltr"]/text()')
        comment.add_xpath(
            'content',
            './/*[@class="post_content comment_content replace_urls"]/*',
            Join('\n'))
        # Build the canonical short URL instead of scraping it.
        comment.add_value(
            'url',
            'https://arabia.io/go/{0}/{1}'.format(
                post.get_output_value('id'),
                comment.get_output_value('id')))
        comment.add_value('item', 'comment')
        comments.append(comment)

    # Resolve parent_id: depth-0 comments are top level (parent 0); any
    # other comment's parent is the nearest preceding comment one level up.
    for (index, comment) in enumerate(comments):
        if comment.get_output_value('index') == 0:
            comment.add_value('parent_id', 0)
            continue
        for comment_cursor in comments[:index][::-1]:
            if comment_cursor.get_output_value(
                    'index') == comment.get_output_value('index') - 1:
                comment.add_value('parent_id',
                                  comment_cursor.get_output_value('id'))
                break

    for comment in comments:
        yield comment.load_item()
def process_item(self, task_id):
    """Fetch one RSS feed task and emit items for its new entries.

    For every feed entry not yet seen, yields:
      * a RssFeedItem holding the raw entry (pickled, DB-binary wrapped),
      * a FeedEntry with title/datetime/content and a georeference
        (entries without one are skipped with a warning),
      * FeedEntryTag items for each entry tag plus the feed's own tag.

    Finally marks the task completed via self.item_completed.

    :param task_id: identifier of the RSS feed task to process.
    """
    feed = self.db.getRssFeeds(task_id)
    # BUG FIX: log message said "processsing" (typo).
    self.log('processing rss feed %s (%s)' % (feed['id'], feed['url']),
             log.INFO)
    # Parse the remote feed.
    feed_data = feedparser.parse(feed['url'])
    self.log('reading %s feed items' % (len(feed_data['items'])), log.INFO)
    # Record that we read this feed now.
    self.db.updateRssFeedLastRead(task_id)
    for item in feed_data['items']:
        # Skip entries that were already processed in an earlier run.
        if self.db.rssFeedItemExists(item['id']):
            self.log('%s - feed item already exists - skipping'
                     % (item['id']), log.INFO)
            continue
        # Store the full raw item, pickled, for later reprocessing.
        l = ItemLoader(RssFeedItem())
        l.add_value('item_id', item['id'])
        l.add_value('content', psycopg2.Binary(pickle.dumps(item)))
        l.add_value('feed_id', task_id)
        yield l.load_item()
        # Stable entry id derived (uuid5-style) from the feed item id.
        feed_entry_id = self.db.uuid5_str(name=str(item['id']))
        l = ItemLoader(FeedEntry())
        l.add_value('id', feed_entry_id)
        l.add_value('title', item['title'])
        l.add_value('incident_datetime',
                    format_datetime(item['updated_parsed']))
        # Prefer full content blocks; fall back to the summary.
        if 'content' in item:
            for c in item['content']:
                l.add_value('content', c['value'])
        elif 'summary' in item:
            l.add_value('content', item['summary'])
        # Fields (location, kml, ...) may be embedded in the entry body.
        embedded_fields = self.extractContentFields(
            l.get_output_value('content'))
        # Prefer an embedded location; fall back to the georss point.
        pt = embedded_fields.get('location') or item.get('georss_point')
        if not pt:
            # No usable georeference: this entry cannot be placed, skip it.
            self.log('%s - No georeference found' % (item['id']),
                     log.WARNING)
            continue
        # "lat lng" or "lat,lng" -> [lat, lng]
        pt = re.split(r"[, ]+", pt)
        l.add_value('lat', pt[0])
        l.add_value('lng', pt[1])
        l.add_value('kml_url', embedded_fields.get('kml') or '')
        for link in item['links']:
            if link['rel'] == 'alternate':
                l.add_value('link', link['href'])
        l.add_value('source_id', feed['source_id'])
        yield l.load_item()
        # One FeedEntryTag per entry tag.
        if 'tags' in item:
            for t in item['tags']:
                l = ItemLoader(FeedEntryTag())
                l.add_value('feed_entry_id', feed_entry_id)
                l.add_value('tag', t['term'])
                l.add_value('comment', t['label'])
                yield l.load_item()
        # Also tag the entry with the feed's own configured tag.
        l = ItemLoader(FeedEntryTag())
        l.add_value('feed_entry_id', feed_entry_id)
        l.add_value('tag', feed['tag'])
        yield l.load_item()
    # Update task status.
    self.item_completed(task_id)
def process_item(self, task_id):
    """Fetch one RSS feed task and emit items for its new entries.

    For each entry in the feed that has not been seen before, yields a
    raw RssFeedItem (pickled entry), a FeedEntry (only when a
    georeference is found), and FeedEntryTag items; then marks the
    task completed.
    """
    feed = self.db.getRssFeeds(task_id)
    # NOTE(review): "processsing" is misspelled in this log message.
    self.log('processsing rss feed %s (%s)' % (feed['id'], feed['url']),
             log.INFO)
    # parse feed
    feed_data = feedparser.parse(feed['url'])
    self.log('reading %s feed items' % (len(feed_data['items'])), log.INFO)
    # update last read time
    self.db.updateRssFeedLastRead(task_id)
    # For each item in feed
    for item in feed_data['items']:
        # if this item has not already been processed
        if self.db.rssFeedItemExists(item['id']):
            self.log(
                '%s - feed item already exists - skipping' % (item['id']),
                log.INFO)
            continue
        # store the full item
        l = ItemLoader(RssFeedItem())
        l.add_value('item_id', item['id'])
        # Pickled entry wrapped for binary DB storage.
        l.add_value('content', psycopg2.Binary(pickle.dumps(item)))
        #l.add_value ('content', pickle.dumps(item))
        l.add_value('feed_id', task_id)
        yield l.load_item()
        # Stable entry id derived from the feed item id (uuid5-style,
        # delegated to the DB helper).
        #feed_entry_id = uuid.uuid5(uuid.NAMESPACE_URL, str(item ['id']))
        feed_entry_id = self.db.uuid5_str(name=str(item['id']))
        l = ItemLoader(FeedEntry())
        l.add_value('id', feed_entry_id)
        l.add_value('title', item['title'])
        # l.add_value ('updated', format_datetime(item['updated_parsed']))
        l.add_value('incident_datetime',
                    format_datetime(item['updated_parsed']))
        # Prefer full content blocks; fall back to the summary.
        if 'content' in item:
            for c in item['content']:
                l.add_value('content', c['value'])
        elif 'summary' in item:
            l.add_value('content', item['summary'])
        # Extract fields (location, kml, ...) embedded in the entry body.
        embedded_fields = self.extractContentFields(
            l.get_output_value('content'))
        # Embedded location wins over the georss point, if both exist.
        pt = embedded_fields.get('location') or item.get('georss_point')
        # print "em: '%s' geo: '%s'" % (embedded_fields.get('location'), item.get('georss_point'))
        if not pt:
            # Entry cannot be placed on a map; skip it entirely.
            self.log('%s - No georeference found' % (item['id']),
                     log.WARNING)
            continue
        # Split "lat lng" / "lat,lng" into its two coordinates.
        pt = re.split("[, ]+", pt)
        l.add_value('lat', pt[0])
        l.add_value('lng', pt[1])
        l.add_value('kml_url', embedded_fields.get('kml') or '')
        for link in item['links']:
            if link['rel'] == 'alternate':
                l.add_value('link', link['href'])
        l.add_value('source_id', feed['source_id'])
        yield l.load_item()
        # One FeedEntryTag per entry tag.
        if 'tags' in item:
            for t in item['tags']:
                l = ItemLoader(FeedEntryTag())
                l.add_value('feed_entry_id', feed_entry_id)
                l.add_value('tag', t['term'])
                l.add_value('comment', t['label'])
                yield l.load_item()
        # Also tag the entry with the feed's own configured tag.
        l = ItemLoader(FeedEntryTag())
        l.add_value('feed_entry_id', feed_entry_id)
        l.add_value('tag', feed['tag'])
        yield l.load_item()
    # update task status
    self.item_completed(task_id)