Example #1
File: pipelines.py  Project: apehex/gdpyr
    def process_item(
            self,
            item: Item,
            spider: Spider) -> Item:
        """
        Save the whole html page to a text file.

        Parameters
        ----------
        item: Item.
            The scraped item, i.e. the full web page plus its metadata.
        spider: Spider.
            The spider, one per document type.

        Returns
        -------
        out: Item.
            The input item, unscathed.
        """
        __provider = ''.join(item.get(
            'provider',
            ['none']))
        __text = ''.join(item.get(
            'text',
            ['']))
        __file_path = os.path.join(
            self._path,
            getattr(spider, 'name', 'default'),
            __provider + '.html')

        with open(__file_path, 'w') as __file:
            __file.write(__text)

        return item
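For context, a minimal sketch of the class around this method, assuming the pipeline receives its output directory from the Scrapy settings and stores it in the self._path attribute used above. The setting name (HTML_DUMP_PATH) and the directory handling are illustrative assumptions, not taken from the apehex/gdpyr project:

import os

from scrapy import Spider


class HtmlDumpPipeline:
    """Hypothetical wrapper for the process_item shown above."""

    def __init__(self, path: str):
        self._path = path

    @classmethod
    def from_crawler(cls, crawler):
        # Assumed setting name; the real project may use a different one.
        return cls(path=crawler.settings.get('HTML_DUMP_PATH', 'dump'))

    def open_spider(self, spider: Spider):
        # Create the per-spider sub-directory before process_item writes to it.
        os.makedirs(
            os.path.join(self._path, getattr(spider, 'name', 'default')),
            exist_ok=True)

The pipeline would then be enabled through the ITEM_PIPELINES setting of the Scrapy project.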
Example #2
    def process_item(self, item: Item, spider):
        # Keep only the first whitespace-separated token of the title.
        item['title'] = item['title'].split()[0]
        if isinstance(item, CommentItem):
            # Comment items go straight into their own collection.
            self.collect2.insert_one(dict(item))
        elif isinstance(item, ReviewItem):
            if item.get('num', None):
                # 'num' is text: keep the part after the first '.' as an int.
                item['num'] = int(item['num'].split('.')[1])
                # Strip all whitespace from the actor field.
                item['actor'] = ''.join(item['actor'].split())
            data = dict(item)
            # Upsert the review, keyed by its (truncated) title.
            self.collect.find_one_and_update({'title': item['title']},
                                             {'$set': data},
                                             upsert=True)
        return item
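This snippet assumes two pymongo collections (self.collect for reviews, self.collect2 for comments) and the project's CommentItem and ReviewItem classes. A minimal sketch of that setup, with the connection URI, database and collection names invented for illustration; the real item definitions live in the project's items module:

import pymongo
from scrapy import Field, Item


class ReviewItem(Item):
    # Only the fields touched by process_item are declared here.
    title = Field()
    num = Field()
    actor = Field()


class CommentItem(Item):
    # The real item likely declares more fields than this.
    title = Field()


class MongoPipeline:
    """Hypothetical wrapper for the process_item shown above."""

    def open_spider(self, spider):
        # Placeholder connection details, not the project's configuration.
        self.client = pymongo.MongoClient('mongodb://localhost:27017')
        db = self.client['scrapy_db']
        self.collect = db['reviews']    # reviews, upserted by title
        self.collect2 = db['comments']  # comments, inserted as-is

    def close_spider(self, spider):
        self.client.close()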