Example No. 1
 def test_data_path_inside_project(self):
     with inside_a_project() as proj_path:
         expected = os.path.join(proj_path, '.scrapy', 'somepath')
         self.assertEqual(os.path.realpath(expected),
                          os.path.realpath(data_path('somepath')))
         abspath = os.path.join(os.path.sep, 'absolute', 'path')
         self.assertEqual(abspath, data_path(abspath))
Example No. 2
 def test_data_path_outside_project(self):
     self.assertEqual(
         os.path.join('.scrapy', 'somepath'),
         data_path('somepath')
     )
     abspath = os.path.join(os.path.sep, 'absolute', 'path')
     self.assertEqual(abspath, data_path(abspath))
Example No. 3
 def test_data_path_inside_project(self):
     with inside_a_project() as proj_path:
         expected = os.path.join(proj_path, '.scrapy', 'somepath')
         self.assertEqual(
             os.path.realpath(expected),
             os.path.realpath(data_path('somepath'))
         )
         self.assertEqual('/absolute/path', data_path('/absolute/path'))
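Taken together, these tests pin down the contract of data_path(): a relative path is resolved under the project's .scrapy data directory (or a plain local .scrapy directory when called outside a project), while an absolute path is returned unchanged; a createdir=True flag, used in later examples, creates the directory on demand. A minimal sketch of that behaviour, assuming the inside_project() and project_data_dir() helpers from scrapy.utils.project are available:

import os

from scrapy.utils.project import inside_project, project_data_dir


def data_path(path, createdir=False):
    # Relative paths live under the project's .scrapy data directory,
    # or a local .scrapy directory when no project is active.
    if not os.path.isabs(path):
        if inside_project():
            path = os.path.join(project_data_dir(), path)
        else:
            path = os.path.join('.scrapy', path)
    # Several examples below pass createdir=True and expect the
    # directory to exist afterwards.
    if createdir and not os.path.exists(path):
        os.makedirs(path)
    return path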
Example No. 4
class GoogleCloud(object):
    credentials_json_path = ''.join(
        (data_path('auth/', True), 'google-service-account.json'))

    @classmethod
    def from_crawler(cls, crawler):
        return cls(settings=crawler.settings)

    def __init__(self, settings):
        self.google_cloud_enabled = settings.getbool('GOOGLE_CLOUD_ENABLED')

        if self.google_cloud_enabled:
            credentials_json = settings.get(
                'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON')
            if credentials_json:
                if not os.path.isfile(self.credentials_json_path):
                    with open(self.credentials_json_path, 'w') as outfile:
                        outfile.write(credentials_json)

                os.environ[
                    'GOOGLE_APPLICATION_CREDENTIALS'] = self.credentials_json_path
                logger.info('Google Cloud extension initialized successfully')
            else:
                settings.set('GOOGLE_CLOUD_ENABLED', False)
                raise NotConfigured(
                    'GOOGLE_CLOUD_APPLICATION_CREDENTIALS_JSON '
                    'is not set in settings')

        else:
            logger.info('GOOGLE_CLOUD_ENABLED is False')

    def close_spider(self, spider):
        if (self.google_cloud_enabled
                and os.path.isfile(self.credentials_json_path)):
            os.remove(self.credentials_json_path)
Example No. 5
 def __init__(self, settings):
     self.redis_conn = redis.Redis(
                             host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.persist = settings.get('SCHEDULER_PERSIST', True)
Example No. 6
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('DELTAFETCH_ENABLED'):
         raise NotConfigured
     dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
     dbmodule = s.get('DELTAFETCH_DBM_MODULE', 'anydbm')
     return cls(dir, dbmodule)
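The from_crawler() hook above only builds the middleware when DELTAFETCH_ENABLED is set, and resolves DELTAFETCH_DIR through data_path(). A hedged settings.py sketch for wiring it up; the dotted middleware path and priority value are illustrative, not taken from the example:

# settings.py (illustrative wiring; adjust the dotted path to wherever
# the DeltaFetch class lives in your project)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.DeltaFetch': 100,  # hypothetical path/priority
}
DELTAFETCH_ENABLED = True
DELTAFETCH_DIR = 'deltafetch'      # resolved via data_path('deltafetch')
DELTAFETCH_DBM_MODULE = 'anydbm'   # default shown in the example above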
Example No. 7
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = import_module(
         settings['HTTPCACHE_DBM_MODULE']
     )  # dbm: does such a module really exist? Data storage can't avoid I/O in the end; open() is indispensable.
     self.db = None
Example No. 8
 def __init__(self, settings):
     warn("The LevelDB storage backend is deprecated.",
          ScrapyDeprecationWarning,
          stacklevel=2)
     import leveldb
     self._leveldb = leveldb
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.db = None
Example No. 9
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('DELTAFETCH_ENABLED'):
         raise NotConfigured
     dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
     reset = s.getbool('DELTAFETCH_RESET')
     o = cls(dir, reset, crawler.stats)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 10
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('DELTAFETCH_ENABLED'):
         raise NotConfigured
     dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
     reset = s.getbool('DELTAFETCH_RESET')
     o = cls(dir, reset, crawler.stats)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 11
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('PERSISTENT_PAGE_CLUSTERING'):
         raise NotConfigured
     directory = data_path(s.get('CLUSTERING_DIR', 'clustering'))
     reset = s.getbool('CLUSTERING_RESET')
     o = cls(directory, reset, crawler.stats)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 12
 def __init__(self, filename, save_exec_time=False, *args, **kwargs):
     super(PersistDataManager, self).__init__(*args, **kwargs)
     if filename == '':
         raise PersistDataException(
             'Filename required to persist data on SH')
     self.file_path = data_path(filename + '.json')
     logger.info('Using persistent file : ' + self.file_path)
     self._load()
     if save_exec_time:
         self['spider_exec'] = str(datetime.today())
Example No. 13
    def from_settings(cls, settings, **kwargs):
        from .pybloom import ScalableBloomFilter

        p = settings.get("BLOOMFILTER_PATH", data_path("."))
        ic = settings.get("BLOOMFILTER_SIZE", 5000000)
        ert = settings.get("BLOOMFILTER_ERROR_RATE", 0.001)
        mode = settings.get("BLOOMFILTER_MODE",
                            ScalableBloomFilter.SMALL_SET_GROWTH)

        return cls(path=p, initial_capacity=ic, error_rate=ert, mode=mode)
Example No. 14
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('PERSISTENT_PAGE_CLUSTERING'):
         raise NotConfigured
     directory = data_path(s.get('CLUSTERING_DIR', 'clustering'))
     reset = s.getbool('CLUSTERING_RESET')
     o = cls(directory, reset, crawler.stats)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 15
 def from_crawler(cls, crawler):
     settings = crawler.settings
     if not settings.getbool('DROP_DUPLICATES_ENABLED'):
         raise NotConfigured
     dir__ = data_path(settings.get('CACHE_DIR', 'cache'))
     dedup = cls(dir__, crawler.stats, settings)
     crawler.signals.connect(dedup.spider_opened,
                             signal=signals.spider_opened)
     crawler.signals.connect(dedup.spider_closed,
                             signal=signals.spider_closed)
     return dedup
Example No. 16
 def from_crawler(cls, crawler):
     s = crawler.settings
     if not s.getbool('CRAWL_ONCE_ENABLED', True):
         raise NotConfigured()
     path = data_path(s.get('CRAWL_ONCE_PATH', 'crawl_once'),
                      createdir=True)
     default = s.getbool('CRAWL_ONCE_DEFAULT', default=False)
     o = cls(path, crawler.stats, default)
     crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
     crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
     return o
Example No. 17
 def __init__(self, settings):
     super(S3CacheStorage, self).__init__(settings)
     self.tmpcachedir = data_path(settings.get(
         'S3CACHE_TEMPDIR',
         os.path.join(tempfile.gettempdir(), '.s3cache'),
     ))
     self.aws_access_key = settings['AWS_ACCESS_KEY_ID']
     self.aws_secret_key = settings['AWS_SECRET_ACCESS_KEY']
     self.bucket_name = settings['S3CACHE_BUCKET']
     if self.bucket_name is None:
         raise NotConfigured("S3CACHE_BUCKET must be specified")
     self._conn = None
Example No. 18
    def spider_opened(self):
        s = self.settings
        self.email = EmailNotification(s['EMAIL'], s['PASSWORD'])
        for address in self.addresses:
            self.email.add_address(address)

        self.path = data_path(s['FILE_NAME'])
        try:
            with open(self.path, 'r') as f:
                self.last_url = f.read()
        except Exception:
            self.last_url = 'URL_NULL'
Example No. 19
 def __init__(self, settings):
     super(S3CacheStorage, self).__init__(settings)
     self.tmpcachedir = data_path(
         settings.get(
             'S3CACHE_TEMPDIR',
             os.path.join(tempfile.gettempdir(), '.s3cache'),
         ))
     self.aws_access_key = settings['AWS_ACCESS_KEY_ID']
     self.aws_secret_key = settings['AWS_SECRET_ACCESS_KEY']
     self.bucket_name = settings['S3CACHE_BUCKET']
     if self.bucket_name is None:
         raise NotConfigured("S3CACHE_BUCKET must be specified")
     self._conn = None
Example No. 20
 def delete_spidersfiles():
     dir_path = data_path('')
     logger.debug('Path of .scrapy dir == [%s]' % dir_path)
     display_list = os.listdir(dir_path)
     logger.debug('{%s}' % str(display_list))
     file_list = [
         f for f in os.listdir(dir_path) if f.endswith('Operator.json')
     ]
     for f in file_list:
         logger.info('Deleting file [{}]'.format(f))
         try:
             os.remove(os.path.join(dir_path, f))
         except OSError as e:
             logger.error('Error: %s - %s.' % (e.filename, e.strerror))
Example No. 21
    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.sqlite_database = settings['SQLITE_DATABASE']
        self.table = settings.get('SQLITE_REQUESTS_TABLE', connection.SQLITE_REQUESTS_TABLE)
        self.database = settings.get('SQLITE_DATABASE', connection.SQLITE_DATABASE)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')

        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
        if self.use_gzip:
            self._loads = self._gzip_loads
            self._dumps = self._gzip_dumps
        else:
            self._loads = self._pickle_loads
            self._dumps = self._pickle_dumps

        self.conn = None
Example No. 22
    def parse(self, response):
        # get all proxies and their ports
        proxy_table = response.css('table#proxylisttable')
        proxy_table_rows = proxy_table.xpath('.//tr')[
            1:]  # remove table header

        filename = 'proxies.txt'
        mydata_path = data_path(filename)

        with open(mydata_path, 'w') as fout:
            for row in proxy_table_rows:
                ip = row.xpath('./td[1]/text()').extract_first()
                port = row.xpath('./td[2]/text()').extract_first()
                if ip and port:
                    proxy = ':'.join([ip, port])
                    fout.write(proxy)
                    fout.write('\n')
Example No. 23
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
     self._open = gzip.open if self.use_gzip else open
Example No. 24
 def __init__(self, settings=conf.settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = __import__(settings['HTTPCACHE_DBM_MODULE'])
     self.dbs = {}
Example No. 25
 def _stats_location(self, spider):
     statsdir = data_path("stats", createdir=True)
     return os.path.join(statsdir, f"{spider.name}_stats_history")
Example No. 26
 def __init__(self, settings: Settings):
     super(SQLiteStorage, self).__init__(settings)
     self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"])
     self.database: str = settings["COOKIES_SQLITE_DATABASE"]
     self.conn: Connection = None
     self.cur: Cursor = None
Example No. 27
 def __init__(self, settings: Settings):
     super(InMemoryStorage, self).__init__()
     self.settings: Settings = settings
     self.cookies_dir: str = data_path(settings["COOKIES_PERSISTENCE_DIR"])
Example No. 28
 def __init__(self, settings=conf.settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
Example No. 29
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
Example No. 30
 def test_data_path_inside_project(self):
     with inside_a_project() as proj_path:
         expected = os.path.join(proj_path, '.scrapy', 'somepath')
         self.assertEqual(expected, data_path('somepath'))
         self.assertEqual('/absolute/path', data_path('/absolute/path'))
Example No. 31
 def __init__(self, settings):
     self.cachedir = data_path(settings["HTTPCACHE_DIR"], createdir=True)
     self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
     self.dbmodule = import_module(settings["HTTPCACHE_DBM_MODULE"])
     self.db = None
Example No. 32
def run_cleanup_cache(settings):
    days = int(
        settings.get('FEEDS_CONFIG', {}).get('feeds',
                                             {}).get('cache_expires', 14))
    cleanup_cache(data_path(settings['HTTPCACHE_DIR']),
                  datetime.now() - timedelta(days=days))
Example No. 33
 def __init__(self, settings=conf.settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = __import__(settings['HTTPCACHE_DBM_MODULE'])
     self.dbs = {}
Example No. 34
    def __init__(self, crawler):
        super(LocalStorageStatsHistoryCollector, self).__init__(crawler)

        statsdir = data_path("stats", createdir=True)
        self.stats_location = os.path.join(
            statsdir, "{}_stats_history".format(crawler.spider.name))
Example No. 35
 def __init__(self, settings: Settings):
     super(FilesystemCacheStorage, self).__init__(settings)
     self.cachedir = data_path(settings["HTTPCACHE_DIR"])
     self.use_gzip = settings.getbool("HTTPCACHE_GZIP")
     self._open = gzip.open if self.use_gzip else open
Example No. 36
 def __init__(self, settings):
     super(FilesystemCacheStorage, self).__init__(settings)
     self.cachedir = data_path(self.httpcache_dir.to_value())
     self.expiration_secs = self.httpcache_expiration_secs.to_value()
Example No. 37
 def __init__(self, settings):
     import leveldb
     self._leveldb = leveldb
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.db = None
Example No. 38
 def test_data_path_outside_project(self):
     self.assertEqual('.scrapy/somepath', data_path('somepath'))
     self.assertEqual('/absolute/path', data_path('/absolute/path'))
Example No. 39
 def __init__(self, settings):
     self.cachedir = data_path(settings["HTTPCACHE_DIR"])
     self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
Example No. 40
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
Example No. 41
    'RISJbot.spmiddlewares.refetchcontrol.RefetchControl': 800,
    # Note: Should be after RefetchControl, to ensure that the URLs stored
    #       are the altered "canonical" ones.
    'RISJbot.spmiddlewares.equivalentdomains.EquivalentDomains': 900,
    'RISJbot.spmiddlewares.unwantedcontent.UnwantedContent': 950,
}

# Enable RefetchControl, 8 fetches total, every 3 hours, including a
# trawl of previously-fetched pages for completeness (TN, 2017-03-15)
REFETCHCONTROL_ENABLED = True
REFETCHCONTROL_MAXFETCHES = 8
REFETCHCONTROL_REFETCHSECS = 10800
REFETCHCONTROL_REFETCHFROMDB = True
REFETCHCONTROL_TRIMDB = True
REFETCHCONTROL_RQCALLBACK = 'spider.parse_page'
REFETCHCONTROL_DIR = data_path('RefetchControl', createdir=True)

# Enable UnwantedContent, stripping figures etc. (TN, 2017-02-27)
UNWANTEDCONTENT_ENABLED = True
UNWANTEDCONTENT_XPATHS = [
    '//figure',
    '//script',
    '//style',
    '//form',
]

# Enable Fake404, dropping responses that are actually "page not found",
# but come with an improper HTTP 200 success code. Lookin' at you, foxnews.com.
FAKE404_ENABLED = True
# List of ( url regex, matching xpath ) tuples
FAKE404_DETECTIONSIGS = [
Example No. 42
 def __init__(self, settings):
     import leveldb
     self._leveldb = leveldb
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.db = None
Example No. 43
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
     self.db = None
Example No. 44
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'])
     self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
     self._open = gzip.open if self.use_gzip else open
Example No. 45
 def test_data_path_outside_project(self):
     self.assertEqual(os.path.join('.scrapy', 'somepath'),
                      data_path('somepath'))
     abspath = os.path.join(os.path.sep, 'absolute', 'path')
     self.assertEqual(abspath, data_path(abspath))
Example No. 46
 def test_data_path_outside_project(self):
     self.assertEqual('.scrapy/somepath', data_path('somepath'))
     self.assertEqual('/absolute/path', data_path('/absolute/path'))
Example No. 47
 def __init__(self, settings):
     self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
     self.httpcache_expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
     self.dbmodule = __import__(settings['HTTPCACHE_DBM_MODULE'])
     self.db = None
Example No. 48
 def __init__(self, settings):
     self.cachedir = data_path(self.httpcache_dir.to_value())
     self.expiration_secs = self.httpcache_expiration_secs.to_value()
     self.dbmodule = __import__(self.httpcache_dbm_module.to_value())
     self.dbs = {}