Example No. 1
 def test_data_path_inside_project(self):
     with inside_a_project() as proj_path:
         expected = os.path.join(proj_path, '.scrapy', 'somepath')
         self.assertEqual(os.path.realpath(expected),
                          os.path.realpath(data_path('somepath')))
         abspath = os.path.join(os.path.sep, 'absolute', 'path')
         self.assertEqual(abspath, data_path(abspath))
Example No. 2
 def test_data_path_outside_project(self):
     self.assertEqual(
         os.path.join('.scrapy', 'somepath'),
         data_path('somepath')
     )
     abspath = os.path.join(os.path.sep, 'absolute', 'path')
     self.assertEqual(abspath, data_path(abspath))
Example No. 3
 def test_data_path_inside_project(self):
     with inside_a_project() as proj_path:
         expected = os.path.join(proj_path, '.scrapy', 'somepath')
         self.assertEqual(
             os.path.realpath(expected),
             os.path.realpath(data_path('somepath'))
         )
         self.assertEqual('/absolute/path', data_path('/absolute/path'))
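Taken together, these tests pin down the contract of data_path(): a relative path is resolved under the project's .scrapy data directory (or a plain local .scrapy directory when called outside a project), while an absolute path is returned unchanged; a createdir=True flag, used in later examples, creates the directory on demand. A minimal sketch of that behaviour, assuming the inside_project() and project_data_dir() helpers from scrapy.utils.project are available:

import os

from scrapy.utils.project import inside_project, project_data_dir


def data_path(path, createdir=False):
    # Relative paths live under the project's .scrapy data directory,
    # or a local .scrapy directory when no project is active.
    if not os.path.isabs(path):
        if inside_project():
            path = os.path.join(project_data_dir(), path)
        else:
            path = os.path.join('.scrapy', path)
    # Several examples below pass createdir=True and expect the
    # directory to exist afterwards.
    if createdir and not os.path.exists(path):
        os.makedirs(path)
    return path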
Example No. 4
class GoogleCloud(object):
    credentials_json_path = ''.join(
        (data_path('auth/', True), 'google-service-account.json'))

    @classmethod
    def from_crawler(cls, crawler):
        return cls(settings=crawler.settings)

    def __init__(self, settings):
        self.google_cloud_enabled = settings.getbool('GOOGLE_CLOUD_ENABLED')

        if self.google_cloud_enabled:
            credentials_json = settings.get(
                'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON')
            if credentials_json:
                if not os.path.isfile(self.credentials_json_path):
                    with open(self.credentials_json_path, 'w') as outfile:
                        outfile.write(credentials_json)

                os.environ[
                    'GOOGLE_APPLICATION_CREDENTIALS'] = self.credentials_json_path
                logger.info('Google Cloud extension initialized successfully')
            else:
                settings.set('GOOGLE_CLOUD_ENABLED', False)
                raise NotConfigured(
                    'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON '
                    'is not set in settings')

        else:
            logger.info('GOOGLE_CLOUD_ENABLED is False')

    def close_spider(self, spider):
        if (self.google_cloud_enabled
                and os.path.isfile(self.credentials_json_path)):
            os.remove(self.credentials_json_path)
Example No. 5
 def __init__(self, settings):
     self.redis_conn = redis.Redis(
                             host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.persist = settings.get('SCHEDULER_PERSIST', True)
Example No. 6
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('DELTAFETCH_ENABLED'):
         raise NotConfigured
     dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
     dbmodule = s.get('DELTAFETCH_DBM_MODULE', 'anydbm')
     return cls(dir, dbmodule)
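The from_crawler() hook above only builds the middleware when DELTAFETCH_ENABLED is set, and resolves DELTAFETCH_DIR through data_path(). A hedged settings.py sketch for wiring it up; the dotted middleware path and priority value are illustrative, not taken from the example:

# settings.py (illustrative wiring; adjust the dotted path to wherever
# the DeltaFetch class lives in your project)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.DeltaFetch': 100,  # hypothetical path/priority
}
DELTAFETCH_ENABLED = True
DELTAFETCH_DIR = 'deltafetch'      # resolved via data_path('deltafetch')
DELTAFETCH_DBM_MODULE = 'anydbm'   # default shown in the example above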
Example No. 7
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = import_module(
         settings['HTTPCACHE_DBM_MODULE']
     )  # dbm: does such a module really exist? Data storage can't avoid I/O in the end; open() is indispensable.
     self.db = None
Example No. 8
 def __init__(self, settings):
     warn("The LevelDB storage backend is deprecated.",
          ScrapyDeprecationWarning,
          stacklevel=2)
     import leveldb
     self._leveldb = leveldb
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.db = None
Example No. 9
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('DELTAFETCH_ENABLED'):
         raise NotConfigured
     dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
     reset = s.getbool('DELTAFETCH_RESET')
     o = cls(dir, reset, crawler.stats)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 10
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('DELTAFETCH_ENABLED'):
         raise NotConfigured
     dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
     reset = s.getbool('DELTAFETCH_RESET')
     o = cls(dir, reset, crawler.stats)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 11
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('PERSISTENT_PAGE_CLUSTERING'):
         raise NotConfigured
     directory = data_path(s.get('CLUSTERING_DIR', 'clustering'))
     reset = s.getbool('CLUSTERING_RESET')
     o = cls(directory, reset, crawler.stats)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 12
 def __init__(self, filename, save_exec_time=False, *args, **kwargs):
     super(PersistDataManager, self).__init__(*args, **kwargs)
     if filename == '':
         raise PersistDataException(
             'Filename required to persist data on SH')
     self.file_path = data_path(filename + '.json')
     logger.info('Using persistent file : ' + self.file_path)
     self._load()
     if save_exec_time:
         self['spider_exec'] = str(datetime.today())
Example No. 13
    def from_settings(cls, settings, **kwargs):
        from .pybloom import ScalableBloomFilter

        p = settings.get("BLOOMFILTER_PATH", data_path("."))
        ic = settings.get("BLOOMFILTER_SIZE", 5000000)
        ert = settings.get("BLOOMFILTER_ERROR_RATE", 0.001)
        mode = settings.get("BLOOMFILTER_MODE",
                            ScalableBloomFilter.SMALL_SET_GROWTH)

        return cls(path=p, initial_capacity=ic, error_rate=ert, mode=mode)
Example No. 14
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('PERSISTENT_PAGE_CLUSTERING'):
         raise NotConfigured
     directory = data_path(s.get('CLUSTERING_DIR', 'clustering'))
     reset = s.getbool('CLUSTERING_RESET')
     o = cls(directory, reset, crawler.stats)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 15
 def from_crawler(cls, crawler):
     settings = crawler.settings
     if not settings.getbool('DROP_DUPLICATES_ENABLED'):
         raise NotConfigured
     dir__ = data_path(settings.get('CACHE_DIR', 'cache'))
     dedup = cls(dir__, crawler.stats, settings)
     crawler.signals.connect(dedup.spider_opened,
                             signal=signals.spider_opened)
     crawler.signals.connect(dedup.spider_closed,
                             signal=signals.spider_closed)
     return dedup
Example No. 16
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('CRAWL_ONCE_ENABLED', True):
         raise NotConfigured()
     path = data_path(s.get('CRAWL_ONCE_PATH', 'crawl_once'),
                      createdir=True)
     default = s.getbool('CRAWL_ONCE_DEFAULT', default=False)
     o = cls(path, crawler.stats, default)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 17
 def __init__(self, settings):
     super(S3CacheStorage, self).__init__(settings)
     self.tmpcachedir = data_path(settings.get(
         'S3CACHE_TEMPDIR',
         os.path.join(tempfile.gettempdir(), '.s3cache'),
     ))
     self.aws_access_key = settings['AWS_ACCESS_KEY_ID']
     self.aws_secret_key = settings['AWS_SECRET_ACCESS_KEY']
     self.bucket_name = settings['S3CACHE_BUCKET']
     if self.bucket_name is None:
         raise NotConfigured("S3CACHE_BUCKET must be specified")
     self._conn = None
Example No. 18
    def spider_opened(self):
        s = self.settings
        self.email = EmailNotification(s['EMAIL'], s['PASSWORD'])
        for address in self.addresses:
            self.email.add_address(address)

        self.path = data_path(s['FILE_NAME'])
        try:
            with open(self.path, 'r') as f:
                self.last_url = f.read()
        except Exception:
            self.last_url = 'URL_NULL'
Example No. 19
 def __init__(self, settings):
     super(S3CacheStorage, self).__init__(settings)
     self.tmpcachedir = data_path(
         settings.get(
             'S3CACHE_TEMPDIR',
             os.path.join(tempfile.gettempdir(), '.s3cache'),
         ))
     self.aws_access_key = settings['AWS_ACCESS_KEY_ID']
     self.aws_secret_key = settings['AWS_SECRET_ACCESS_KEY']
     self.bucket_name = settings['S3CACHE_BUCKET']
     if self.bucket_name is None:
         raise NotConfigured("S3CACHE_BUCKET must be specified")
     self._conn = None
Example No. 20
 def delete_spidersfiles():
     dir_path = data_path('')
     logger.debug('Path of .scrapy dir == [%s]' % dir_path)
     display_list = os.listdir(dir_path)
     logger.debug('{%s}' % str(display_list))
     file_list = [
         f for f in os.listdir(dir_path) if f.endswith('Operator.json')
     ]
     for f in file_list:
         logger.info('Deleting file [{}]'.format(f))
         try:
             os.remove(os.path.join(dir_path, f))
         except OSError as e:
             logger.error('Error: %s - %s.' % (e.filename, e.strerror))
Example No. 21
    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.sqlite_database = settings['SQLITE_DATABASE']
        self.table = settings.get('SQLITE_REQUESTS_TABLE', connection.SQLITE_REQUESTS_TABLE)
        self.database = settings.get('SQLITE_DATABASE', connection.SQLITE_DATABASE)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')

        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
        if self.use_gzip:
            self._loads = self._gzip_loads
            self._dumps = self._gzip_dumps
        else:
            self._loads = self._pickle_loads
            self._dumps = self._pickle_dumps

        self.conn = None
Example No. 22
    def parse(self, response):
        # get all proxies and their ports
        proxy_table = response.css('table#proxylisttable')
        proxy_table_rows = proxy_table.xpath('.//tr')[
            1:]  # remove table header

        filename = 'proxies.txt'
        mydata_path = data_path(filename)

        with open(mydata_path, 'w') as fout:
            for row in proxy_table_rows:
                ip = row.xpath('./td[1]/text()').extract_first()
                port = row.xpath('./td[2]/text()').extract_first()
                if ip and port:
                    proxy = ':'.join([ip, port])
                    fout.write(proxy)
                    fout.write('\n')
Example No. 23
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
     self._open = gzip.open if self.use_gzip else open
Example No. 24
 def __init__(self, settings=conf.settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = __import__(settings['HTTPCACHE_DBM_MODULE'])
     self.dbs = {}
Example No. 25
 def _stats_location(self, spider):
     statsdir = data_path("stats", createdir=True)
     return os.path.join(statsdir, f"{spider.name}_stats_history")
Example No. 26
 def __init__(self, settings: Settings):
     super(SQLiteStorage, self).__init__(settings)
     self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"])
     self.database: str = settings["COOKIES_SQLITE_DATABASE"]
     self.conn: Connection = None
     self.cur: Cursor = None
Example No. 27
 def __init__(self, settings: Settings):
     super(InMemoryStorage, self).__init__()
     self.settings: Settings = settings
     self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"])
Example No. 28
 def __init__(self, settings=conf.settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
Example No. 29
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
Example No. 30
 def test_data_path_inside_project(self):
     with inside_a_project() as proj_path:
         expected = os.path.join(proj_path, '.scrapy', 'somepath')
         self.assertEqual(expected, data_path('somepath'))
         self.assertEqual('/absolute/path', data_path('/absolute/path'))
Example No. 31
 def __init__(self, settings):
     self.cachedir = data_path(settings["HTTPCACHE_DIR"], createdir=True)
     self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
     self.dbmodule = import_module(settings["HTTPCACHE_DBM_MODULE"])
     self.db = None
Example No. 32
def run_cleanup_cache(settings):
    days = int(
        settings.get('FEEDS_CONFIG', {}).get('feeds',
                                             {}).get('cache_expires', 14))
    cleanup_cache(data_path(settings['HTTPCACHE_DIR']),
                  datetime.now() - timedelta(days=days))
Example No. 33
 def __init__(self, settings=conf.settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = __import__(settings['HTTPCACHE_DBM_MODULE'])
     self.dbs = {}
Example No. 34
    def __init__(self, crawler):
        super(LocalStorageStatsHistoryCollector, self).__init__(crawler)

        statsdir = data_path("stats", createdir=True)
        self.stats_location = os.path.join(
            statsdir, "{}_stats_history".format(crawler.spider.name))
Example No. 35
 def __init__(self, settings: Settings):
     super(FilesystemCacheStorage, self).__init__(settings)
     self.cachedir = data_path(settings["HTTPCACHE_DIR"])
     self.use_gzip = settings.getbool("HTTPCACHE_GZIP")
     self._open = gzip.open if self.use_gzip else open
Example No. 36
 def __init__(self, settings):
     super(FilesystemCacheStorage, self).__init__(settings)
     self.cachedir = data_path(self.httpcache_dir.to_value())
     self.expiration_secs = self.httpcache_expiration_secs.to_value()
Example No. 37
 def __init__(self, settings):
     import leveldb
     self._leveldb = leveldb
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.db = None
Example No. 38
 def test_data_path_outside_project(self):
     self.assertEqual('.scrapy/somepath', data_path('somepath'))
     self.assertEqual('/absolute/path', data_path('/absolute/path'))
Example No. 39
 def __init__(self, settings):
     self.cachedir = data_path(settings["HTTPCACHE_DIR"])
     self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
Example No. 40
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
Example No. 41
    'RISJbot.spmiddlewares.refetchcontrol.RefetchControl': 800,
    # Note: Should be after RefetchControl, to ensure that the URLs stored
    #       are the altered "canonical" ones.
    'RISJbot.spmiddlewares.equivalentdomains.EquivalentDomains': 900,
    'RISJbot.spmiddlewares.unwantedcontent.UnwantedContent': 950,
}

# Enable RefetchControl, 8 fetches total, every 3 hours, including a
# trawl of previously-fetched pages for completeness (TN, 2017-03-15)
REFETCHCONTROL_ENABLED = True
REFETCHCONTROL_MAXFETCHES = 8
REFETCHCONTROL_REFETCHSECS = 10800
REFETCHCONTROL_REFETCHFROMDB = True
REFETCHCONTROL_TRIMDB = True
REFETCHCONTROL_RQCALLBACK = 'spider.parse_page'
REFETCHCONTROL_DIR = data_path('RefetchControl', createdir=True)

# Enable UnwantedContent, stripping figures etc. (TN, 2017-02-27)
UNWANTEDCONTENT_ENABLED = True
UNWANTEDCONTENT_XPATHS = [
    '//figure',
    '//script',
    '//style',
    '//form',
]

# Enable Fake404, dropping responses that are actually "page not found",
# but come with an improper HTTP 200 success code. Lookin' at you, foxnews.com.
FAKE404_ENABLED = True
# List of ( url regex, matching xpath ) tuples
FAKE404_DETECTIONSIGS = [
Example No. 42
 def __init__(self, settings):
     import leveldb
     self._leveldb = leveldb
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.db = None
Example No. 43
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
     self.db = None
Example No. 44
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
     self._open = gzip.open if self.use_gzip else open
Example No. 45
 def test_data_path_outside_project(self):
     self.assertEqual(os.path.join('.scrapy', 'somepath'),
                      data_path('somepath'))
     abspath = os.path.join(os.path.sep, 'absolute', 'path')
     self.assertEqual(abspath, data_path(abspath))
Example No. 46
 def test_data_path_outside_project(self):
     self.assertEqual('.scrapy/somepath', data_path('somepath'))
     self.assertEqual('/absolute/path', data_path('/absolute/path'))
Example No. 47
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.httpcache_expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = __import__(settings['HTTPCACHE_DBM_MODULE'])
     self.db = None
Example No. 48
 def __init__(self, settings):
     self.cachedir = data_path(self.httpcache_dir.to_value())
     self.expiration_secs = self.httpcache_expiration_secs.to_value()
     self.dbmodule = __import__(self.httpcache_dbm_module.to_value())
     self.dbs = {}