Example #1
    def process_item(self, item: Item, spider: scrapy.Spider):

        if not isinstance(item, Item):
            spider.log("Invalid item type")
            return

        filename = None
        basepath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'items'))

        if isinstance(item, Tune):
            basepath = os.path.join(basepath, item.artistId)

        if not os.path.isdir(basepath):
            Path(basepath).mkdir(parents=True, exist_ok=True)

        if isinstance(item, Tune):
            nitem = item
            # Remove filesystem unfriendly characters
            nitem.artist = validatechars(nitem.artist)
            nitem.title = validatechars(nitem.title)
            filename = f"{nitem.artist} - {nitem.title}.{nitem.format}"

        if filename is None:
            raise ValueError("No filename")

        # Save to temporary file
        tmpf = NamedTemporaryFile("wb", prefix="amp-", suffix=".bin", delete=False)
        with tmpf as f:
            f.write(item.data)
            f.flush()
            spider.logger.info(f"saved as {f.name}")

        # Rename and move the temporary file to actual file
        newpath = move(tmpf.name, os.path.join(basepath, filename))
        spider.logger.info(f"renamed {tmpf.name} to {newpath}")
Example #2
    def process_item(self, item: Item, spider: scrapy.Spider):
        if not isinstance(item, Item):
            spider.log("invalid item type {0}".format(type(item)))
            return

        if isinstance(item, Memory):
            self.items.append(item)
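Pipelines like the two above only run once they are registered in the project settings. A minimal sketch, with hypothetical module paths and the usual 0-1000 priority values:

    # settings.py -- class paths and priorities are placeholders
    ITEM_PIPELINES = {
        'myproject.pipelines.TuneStoragePipeline': 300,
        'myproject.pipelines.MemoryCollectorPipeline': 400,
    }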
Example #3
    def process_request(self, request: Request, spider: Spider):
        """
         参考: https://www.jianshu.com/p/d64b13a2322b
         https://docs.scrapy.org/en/latest/topics/downloader-
         middleware.html#writing-your-own-downloader-middleware
        :param request:
        :param spider:
        :return:
        """
        try:
            spider.log('Chrome driver begin...')
            self.driver.implicitly_wait(3)
            self.driver.set_script_timeout(5)
            self.driver.set_page_load_timeout(5)
            self.driver.get(request.url)  # fetch the page content
            return HtmlResponse(url=request.url,
                                body=self.driver.page_source,
                                request=request,
                                encoding='utf-8',
                                status=200)  # return the rendered HTML
        except TimeoutException:
            self.sqlite.update_retry_time_field(self.proxy_ip)
            retry_times: int = self.sqlite.fetch_value_of_retry_time(
                self.proxy_ip)
            # If this proxy IP still times out after two retries, mark it as invalid
            if retry_times >= 2:
                self.sqlite.update_is_ok_field(self.proxy_ip)

            spider.log('Chrome driver end...')
            return HtmlResponse(url=request.url,
                                request=request,
                                encoding='utf-8',
                                status=500)
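A downloader middleware like this Selenium/Chrome one must likewise be enabled in the settings; a sketch with a placeholder class path and priority:

    # settings.py -- the class path and priority are assumptions
    DOWNLOADER_MIDDLEWARES = {
        'myproject.middlewares.ChromeProxyMiddleware': 543,
    }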
Example #4
    def process_item(self, item: dict, spider: Spider):
        if isinstance(spider, RssFeedBaseSpider):
            _item = item.copy()
            pub_date_tz = parse_date(_item['pub_date'])
            _item['pub_date'] = pub_date_tz.astimezone(tz=UTC)
            try:
                with self.db.transaction():
                    RssNews.create(**_item)
            except DatabaseError as err:
                spider.log('DatabaseError raised: {0}'.format(err), level=ERROR)
        return item
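The pipeline normalizes `pub_date` to UTC before writing. Where `parse_date` comes from is not shown; assuming it wraps something like `dateutil`, the conversion behaves roughly as follows:

    from datetime import timezone
    from dateutil.parser import parse  # one plausible backing for parse_date

    pub_date_tz = parse('Mon, 06 Sep 2021 12:00:00 +0300')
    print(pub_date_tz.astimezone(tz=timezone.utc))  # 2021-09-06 09:00:00+00:00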
Example #5
    def process_item(self, item: Item, spider: scrapy.Spider):
        if not isinstance(item, Item):
            spider.log("Invalid item type")
            return

        filename = None
        basepath = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', 'items'))

        if isinstance(item, Tune):
            if item.data is None:
                spider.logger.info("Data is None")
                return

            if len(item.data) == 0:
                spider.logger.info("No data")
                return

            nitem: Tune = item
            # Remove filesystem unfriendly characters
            nitem.arranger = validatechars(nitem.arranger)
            nitem.title = validatechars(nitem.title)
            nitem.composer = validatechars(nitem.composer)

            filename = f"{nitem.arranger} - {nitem.title} [{nitem.composer}] {nitem.added.strftime('%Y-%m-%d')}.mp3"

        if filename is None:
            raise ValueError("No filename")

        if not os.path.isdir(basepath):
            Path(basepath).mkdir(parents=True, exist_ok=True)

        # Save to temporary file
        tmpf = NamedTemporaryFile("wb",
                                  prefix="amigaremix-",
                                  suffix=f".mp3",
                                  delete=False)
        with tmpf as f:
            f.write(item.data)
            f.flush()
            spider.logger.info(f"saved as {f.name}")

        # Rename and move the temporary file to actual file
        newpath = move(tmpf.name, os.path.join(basepath, filename))
        spider.logger.info(f"renamed {tmpf.name} to {newpath}")
Example #6
    def open(self, spider: Spider) -> None:
        self.spider = spider

        try:
            self.queue = load_object(self.queue_cls)(
                server=self.server,
                spider=spider,
                key=self.queue_key % {
                    'spider': spider.name
                },
                serializer=self.serializer,
            )
        except TypeError as e:
            # %-interpolate the message; passing the args to ValueError
            # directly would leave the placeholders unfilled
            raise ValueError("Failed to instantiate queue class '%s': %s"
                             % (self.queue_cls, e))

        self.df = load_object(self.dupefilter_cls).from_spider(spider)

        if self.flush_on_start:
            self.flush()
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" %
                       len(self.queue))
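`self.queue_key % {'spider': spider.name}` expects a template containing a `%(spider)s` placeholder. A settings sketch assuming scrapy-redis-style defaults:

    # settings.py -- values follow scrapy-redis conventions (an assumption here)
    SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
    SCHEDULER_QUEUE_KEY = '%(spider)s:requests'  # becomes e.g. 'myspider:requests'
    SCHEDULER_FLUSH_ON_START = False             # True would flush() on open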
Example #7
    def open_spider(self, spider: scrapy.Spider):
        """Creates and initializes the output folders that store the
        comment items.
        """
        try:
            os.mkdir('data')
            spider.log(' Directory data/ created', level=logging.INFO)
        except FileExistsError:
            spider.log(' Directory data/ already exists', level=logging.INFO)
        os.mkdir('data/{}-{}'.format(spider.name, self.key))
        spider.log(' Directory data/{}-{} created'.format(spider.name, self.key),
                   level=logging.INFO)
        filename = 'data/{0}-{1}/part-{2:05d}.jl'.format(
            spider.name, self.key, self.file_index)
        self.file = open(filename, 'a')
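The try/except around `os.mkdir` can be collapsed with `os.makedirs`, at the cost of the created/already-exists log distinction; a sketch of the equivalent:

    import os

    # exist_ok=True creates 'data' and the per-spider subfolder in one call
    # and makes the FileExistsError handling above unnecessary.
    os.makedirs('data/{}-{}'.format(spider.name, self.key), exist_ok=True)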
Example #8
    def process_item(self, item: scrapy.Item, spider: scrapy.Spider) -> scrapy.Item:
        for k, v in item.items():
            spider.log('{}: {}'.format(k, v), logging.INFO)
        return item
Example #9
    def process_item(self, item: ScraperItem, spider: Spider) -> ScraperItem:
        spider.log(pformat(dict(item), indent=4))
        return item
Example #10
    def process_item(self, item, spider):
        # insert_one returns a result holding the generated _id; dict(item)
        # is a copy, so the _id is never written back onto the item itself
        result = self.db[self.collection_name].insert_one(dict(item))
        spider.log('inserted %s' % result.inserted_id, logging.DEBUG)
        return item
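For `self.db` and `self.collection_name` to exist, the MongoDB client is typically opened and closed alongside the spider; a minimal sketch following the pattern from the Scrapy docs, with placeholder setting names:

    import pymongo

    class MongoPipeline:
        collection_name = 'items'  # placeholder

        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db

        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI'),
                mongo_db=crawler.settings.get('MONGO_DATABASE', 'items'),
            )

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]

        def close_spider(self, spider):
            self.client.close()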
Example #11
    def process_item(self, item: Item, spider: Spider):
        """
        重写框架方法
        :param item: 由框架自动传入
        :param spider: 由框架自动传入
        :return:
        """
        if isinstance(item, ZhugefangDetailUrlsItem):
            insert_sql: str = """
            INSERT INTO python_detail_urls
            (building_url, city_id,
            building_from, is_new, commit_time)
            VALUES
            ('{building_url}','{city_id}',
            '{building_from}','{is_new}','{commit_time}')
            """.format(building_url=item['comm_url'],
                       city_id=item['city_id'],
                       building_from=item['comm_from'],
                       is_new=item['is_new'],
                       commit_time=item['commit_time'])
            try:
                spider.log('SQL Prepared: ' + insert_sql)
                self.cursor.execute(insert_sql)
            except MySQLError:
                import traceback
                traceback.print_exc()
                self.cursor.close()
                self.conn.close()
                spider.log('The DB connection has closed!')
            else:
                self.conn.commit()
            return item
        elif isinstance(item, ZhugefangOldItem):
            insert_sql: str = """INSERT INTO python_project
            (pj_name, addr, avg_price,
            const_era, property_desc,
            plot_ratio, greening,
            buildings_amount, houses_amount,
            property_price, property,
            develop, const_area, building_type,
            city_id, building_from,
            building_url, is_new, commit_time)
            VALUES
            ('{pj_name}', '{addr}', '{avg_price}',
            '{const_era}','{property_desc}',
            '{plot_ratio}','{greening}',
            '{buildings_amount}','{houses_amount}',
            '{property_price}','{property}',
            '{develop}','{const_area}','{building_type}',
            '{city_id}','{building_from}',
            '{building_url}','{is_new}','{commit_time}')""".format(
                pj_name=item['comm_name'],
                addr=item['comm_addr'],
                avg_price=item['comm_price'],
                const_era=item['const_era'],
                property_desc=item['property_desc'],
                plot_ratio=item['plot_ratio'],
                greening=item['greening_ratio'],
                buildings_amount=item['buildings_amount'],
                houses_amount=item['houses_amount'],
                property_price=item['property_fee'],
                property=item['property_comp'],
                develop=item['dev_comp'],
                const_area=item['const_area'],
                building_type=item['const_type'],
                city_id=item['city_id'],
                building_from=item['comm_from'],
                building_url=item['comm_url'],
                is_new=item['is_new'],
                commit_time=item['commit_time'])

            try:
                spider.log('SQL Prepared: ' + insert_sql)
                self.cursor.execute(insert_sql)
            except MySQLError:
                import traceback
                traceback.print_exc()
                self.cursor.close()
                self.conn.close()
                spider.log('The DB connection has closed!')
            else:
                self.conn.commit()
            return item
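Interpolating item fields into the SQL with `str.format` leaves both inserts open to SQL injection, and any value containing a quote breaks the statement. A safer sketch of the first insert using parameterized queries, assuming a DB-API driver such as pymysql backs `self.cursor`:

    insert_sql = (
        "INSERT INTO python_detail_urls "
        "(building_url, city_id, building_from, is_new, commit_time) "
        "VALUES (%s, %s, %s, %s, %s)"
    )
    # The driver escapes each value, so quotes in the data cannot break the SQL.
    self.cursor.execute(insert_sql, (
        item['comm_url'], item['city_id'], item['comm_from'],
        item['is_new'], item['commit_time'],
    ))
    self.conn.commit()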