Exemplo n.º 1
0
    def parse_post(self, response):
        """Extract the post shown on ``response`` and all of its comments.

        Yields the loaded ``ArabiaPostItem`` first, followed by one loaded
        ``ArabiaCommentItem`` per element whose ``class`` contains
        ``post_comment``, in document order.

        Every comment gets a ``parent_id``: ``0`` when its ``index`` (nesting
        depth taken from the ``index<N>`` class) is 0, otherwise the id of the
        nearest preceding comment whose depth is exactly one less.  Comments
        for which no depth or no such predecessor can be found are yielded
        without a ``parent_id``.
        """
        post = ItemLoader(item=ArabiaPostItem(), response=response)
        post.default_output_processor = TakeFirst()
        # The numeric id is taken from the short-URL input value.
        post.add_xpath('id',
                       '//*[@class="short_url inputtext"]/@value',
                       MapCompose(int),
                       re=r'(\d+)')
        post.add_xpath('title', '//*[@id="nav_title"]/a/text()')
        post.add_xpath('up_votes',
                       '//*[@class="s_upvotes"]/text()',
                       MapCompose(int),
                       re=r'(\d+)')
        post.add_xpath('down_votes',
                       '//*[@class="s_downvotes"]/text()',
                       MapCompose(int),
                       re=r'(\d+)')
        post.add_xpath('points', '//*[@class="post_points ltr"]/text()',
                       MapCompose(int))
        post.add_xpath('author_username',
                       '//*[@class="block username"]/text()')
        # Drop the non-breaking spaces embedded in the full name.
        post.add_xpath('author_fullname',
                       '//*[@class="block full_name"]/text()',
                       MapCompose(lambda value: value.replace(u'\xa0', u'')))
        post.add_xpath('date', '//*[@class="icon-time"]/../text()')
        post.add_xpath('community',
                       '//*[@class="icon-reorder"]/../a[1]/text()')
        # ``string.strip`` only exists in Python 2's ``string`` module; calling
        # the method on the value itself works on both Python 2 and 3.
        post.add_xpath('topics', '//*[@class="topic"]/text()',
                       MapCompose(lambda value: value.strip()))
        post.add_xpath('url', '//*[@class="short_url inputtext"]/@value')

        # A title anchor carrying rel="nofollow" marks a link post; anything
        # else is treated as a text post.
        title_rel = post.get_xpath('//*[@id="nav_title"]/a/@rel', TakeFirst())
        post.add_value('type', 'link' if title_rel == 'nofollow' else 'text')
        if post.get_output_value('type') == 'link':
            post.add_xpath('link', '//*[@id="nav_title"]/a/@href')
            # The domain is the text between the parentheses.
            post.add_xpath('domain',
                           '//*[@class="post_domain"]/text()',
                           re=r'\((.+?)\)')
        post.add_xpath('content', '//*[@class="post_content replace_urls"]/*',
                       Join('\n'))
        post.add_value('item', 'post')
        yield post.load_item()

        # The post id is the same for every comment, so compute it once.
        post_id = post.get_output_value('id')

        comments = []
        for row in response.selector.xpath(
                '//*[contains(@class, "post_comment")]'):
            comment = ItemLoader(item=ArabiaCommentItem(),
                                 selector=row,
                                 response=response)
            comment.default_output_processor = TakeFirst()
            comment.add_xpath('id', './@id', re=r'(\d+)')
            comment.add_xpath('index',
                              './@class',
                              MapCompose(int),
                              re=r'index(\d+)')
            comment.add_value('post_id', post_id)
            comment.add_xpath('author_username',
                              './/*[@class="comment_user"]/a/text()')
            comment.add_xpath('date', './/*[@class="comment_date"]/text()')
            comment.add_xpath('points',
                              './/*[@class="comment_points ltr"]/text()')
            comment.add_xpath(
                'content',
                './/*[@class="post_content comment_content replace_urls"]/*',
                Join('\n'))
            # The URL is assembled from the post id and the comment id.
            comment.add_value(
                'url', 'https://arabia.io/go/{0}/{1}'.format(
                    post_id, comment.get_output_value('id')))
            comment.add_value('item', 'comment')
            comments.append(comment)

        # Resolve parents in a single pass: remember the id of the most recent
        # comment seen at each depth, so the parent of a comment at depth d is
        # whatever is currently stored for depth d - 1.
        latest_id_at_depth = {}
        for comment in comments:
            depth = comment.get_output_value('index')
            if depth is None:
                # No ``index<N>`` class was found, so the depth is unknown and
                # the comment can neither have nor be a parent.
                continue
            if depth == 0:
                comment.add_value('parent_id', 0)
            elif depth - 1 in latest_id_at_depth:
                comment.add_value('parent_id', latest_id_at_depth[depth - 1])
            latest_id_at_depth[depth] = comment.get_output_value('id')

        for comment in comments:
            yield comment.load_item()
Exemplo n.º 2
0
    def process_item(self, task_id):
        """Read the RSS feed for ``task_id`` and yield items for new entries.

        For every feed entry that ``self.db.rssFeedItemExists`` does not
        report as already present, the following are yielded in order:

        * an ``RssFeedItem`` holding the pickled raw entry;
        * a ``FeedEntry`` with id, title, timestamp, content, coordinates,
          KML URL, alternate links and the feed's source id;
        * one ``FeedEntryTag`` per entry tag, plus one for the feed's own tag.

        When an entry has no coordinates, or coordinates that do not contain
        at least two values, a warning is logged and only the ``RssFeedItem``
        is yielded for it.

        The feed's last-read time is updated before the entries are processed,
        and ``self.item_completed`` is called once all entries are handled.
        """
        feed = self.db.getRssFeeds(task_id)
        self.log('processsing rss feed %s (%s)' % (feed['id'], feed['url']),
                 log.INFO)

        feed_data = feedparser.parse(feed['url'])
        self.log('reading %s feed items' % (len(feed_data['items'])), log.INFO)

        self.db.updateRssFeedLastRead(task_id)

        for item in feed_data['items']:
            # Entries that were stored by an earlier run are skipped.
            if self.db.rssFeedItemExists(item['id']):
                self.log('%s - feed item already exists - skipping'
                         % (item['id']), log.INFO)
                continue

            # Store the full raw entry, pickled and wrapped as a binary value.
            raw_loader = ItemLoader(RssFeedItem())
            raw_loader.add_value('item_id', item['id'])
            raw_loader.add_value('content',
                                 psycopg2.Binary(pickle.dumps(item)))
            raw_loader.add_value('feed_id', task_id)
            yield raw_loader.load_item()

            feed_entry_id = self.db.uuid5_str(name=str(item['id']))

            entry_loader = ItemLoader(FeedEntry())
            entry_loader.add_value('id', feed_entry_id)
            entry_loader.add_value('title', item['title'])
            entry_loader.add_value('incident_datetime',
                                   format_datetime(item['updated_parsed']))
            # Prefer the full content blocks; fall back to the summary.
            if 'content' in item:
                for block in item['content']:
                    entry_loader.add_value('content', block['value'])
            elif 'summary' in item:
                entry_loader.add_value('content', item['summary'])

            embedded_fields = self.extractContentFields(
                entry_loader.get_output_value('content'))

            # A location embedded in the content wins over the georss point.
            point = (embedded_fields.get('location')
                     or item.get('georss_point'))
            if not point:
                self.log('%s - No georeference found' % (item['id']),
                         log.WARNING)
                continue

            # Discard empty tokens so leading/trailing separators cannot shift
            # the latitude into the longitude slot, and require both values
            # instead of failing with IndexError on a one-value point.
            coords = [part for part in re.split(r"[, ]+", point) if part]
            if len(coords) < 2:
                self.log('%s - Malformed georeference %r'
                         % (item['id'], point), log.WARNING)
                continue
            entry_loader.add_value('lat', coords[0])
            entry_loader.add_value('lng', coords[1])

            entry_loader.add_value('kml_url', embedded_fields.get('kml') or '')

            # Entries without links, or links without a rel, are tolerated.
            for link in item.get('links') or []:
                if link.get('rel') == 'alternate':
                    entry_loader.add_value('link', link['href'])
            entry_loader.add_value('source_id', feed['source_id'])
            yield entry_loader.load_item()

            for tag in item.get('tags') or []:
                tag_loader = ItemLoader(FeedEntryTag())
                tag_loader.add_value('feed_entry_id', feed_entry_id)
                tag_loader.add_value('tag', tag['term'])
                tag_loader.add_value('comment', tag['label'])
                yield tag_loader.load_item()

            # Every entry is additionally tagged with the feed's own tag.
            feed_tag_loader = ItemLoader(FeedEntryTag())
            feed_tag_loader.add_value('feed_entry_id', feed_entry_id)
            feed_tag_loader.add_value('tag', feed['tag'])
            yield feed_tag_loader.load_item()

        # update task status
        self.item_completed(task_id)
Exemplo n.º 3
0
    def process_item(self, task_id):
        """Process the RSS feed registered under ``task_id``.

        Parses the feed URL returned by ``self.db.getRssFeeds`` and, for each
        entry not already known to ``self.db.rssFeedItemExists``, yields:

        1. an ``RssFeedItem`` with the pickled raw entry;
        2. a ``FeedEntry`` carrying id, title, timestamp, content, lat/lng,
           KML URL, alternate links and the feed's source id;
        3. a ``FeedEntryTag`` for each of the entry's tags and a final one
           for the feed's own tag.

        Steps 2 and 3 are skipped, with a logged warning, when the entry has
        no coordinates or its coordinates hold fewer than two values.

        The last-read time is updated before iterating the entries, and
        ``self.item_completed`` runs after the last entry has been handled.
        """
        feed = self.db.getRssFeeds(task_id)
        self.log('processsing rss feed %s (%s)' % (feed['id'], feed['url']),
                 log.INFO)
        # parse feed
        feed_data = feedparser.parse(feed['url'])

        self.log('reading %s feed items' % (len(feed_data['items'])), log.INFO)

        # update last read time
        self.db.updateRssFeedLastRead(task_id)

        for item in feed_data['items']:
            item_id = item['id']

            # Skip entries that have already been processed.
            if self.db.rssFeedItemExists(item_id):
                self.log(
                    '%s - feed item already exists - skipping' % (item_id),
                    log.INFO)
                continue

            # Keep the complete entry as a pickled binary value.
            stored = ItemLoader(RssFeedItem())
            stored.add_value('item_id', item_id)
            stored.add_value('content', psycopg2.Binary(pickle.dumps(item)))
            stored.add_value('feed_id', task_id)
            yield stored.load_item()

            feed_entry_id = self.db.uuid5_str(name=str(item_id))

            entry = ItemLoader(FeedEntry())
            entry.add_value('id', feed_entry_id)
            entry.add_value('title', item['title'])
            entry.add_value('incident_datetime',
                            format_datetime(item['updated_parsed']))
            # Use the content blocks when present, else the summary.
            if 'content' in item:
                for part in item['content']:
                    entry.add_value('content', part['value'])
            elif 'summary' in item:
                entry.add_value('content', item['summary'])

            embedded_fields = self.extractContentFields(
                entry.get_output_value('content'))

            # The location found in the content takes precedence over the
            # entry's georss point.
            location = (embedded_fields.get('location')
                        or item.get('georss_point'))
            if not location:
                self.log('%s - No georeference found' % (item_id),
                         log.WARNING)
                continue

            # Splitting on separators yields empty strings for leading or
            # trailing separators; drop them so lat/lng are not shifted, and
            # reject points with fewer than two values rather than raising
            # IndexError.
            values = [v for v in re.split(r"[, ]+", location) if v]
            if len(values) < 2:
                self.log('%s - Malformed georeference %r'
                         % (item_id, location), log.WARNING)
                continue
            entry.add_value('lat', values[0])
            entry.add_value('lng', values[1])

            entry.add_value('kml_url', embedded_fields.get('kml') or '')

            # Missing ``links`` or ``rel`` keys are treated as "no link".
            for link in item.get('links') or []:
                if link.get('rel') == 'alternate':
                    entry.add_value('link', link['href'])
            entry.add_value('source_id', feed['source_id'])
            yield entry.load_item()

            for tag in item.get('tags') or []:
                entry_tag = ItemLoader(FeedEntryTag())
                entry_tag.add_value('feed_entry_id', feed_entry_id)
                entry_tag.add_value('tag', tag['term'])
                entry_tag.add_value('comment', tag['label'])
                yield entry_tag.load_item()

            # The feed's own tag is attached to every entry as well.
            feed_tag = ItemLoader(FeedEntryTag())
            feed_tag.add_value('feed_entry_id', feed_entry_id)
            feed_tag.add_value('tag', feed['tag'])
            yield feed_tag.load_item()

        # update task status
        self.item_completed(task_id)