Пример #1
0
def get_application(config=None):
    """Override Scrapy's default get_application.

    When no config is supplied, build one and patch it for Heroku:
    the platform hands us the listen port via $PORT.
    """
    if config is None:
        config = Config()
        # On Heroku, $PORT dictates the listen port and we must bind to
        # all interfaces. Note: http_port must stay a string, not an int.
        port_override = os.environ.get('PORT')
        config.cp['scrapyd'].update(
            http_port=config.get('http_port') if port_override is None else port_override,
            bind_address='0.0.0.0' if port_override else config.get('bind_address'),
        )

    factory_path = config.get('application', 'scrapyd.app.application')
    factory = load_object(factory_path)
    return factory(config)
Пример #2
0
def get_spider_list(project, runner=None, pythonpath=None, version=''):
    """Return the spider list from the given project, using the given runner."""
    # The cache is lazily attached to the function object itself.
    if not hasattr(get_spider_list, 'cache'):
        get_spider_list.cache = UtilsCache()
    try:
        return get_spider_list.cache[project][version]
    except KeyError:
        pass
    if runner is None:
        runner = Config().get('runner')

    # Run `python -m <runner> list` in a subprocess with the project
    # selected through the environment.
    env = os.environ.copy()
    env['PYTHONIOENCODING'] = 'UTF-8'
    env['SCRAPY_PROJECT'] = project
    if pythonpath:
        env['PYTHONPATH'] = pythonpath
    if version:
        env['SCRAPY_EGG_VERSION'] = version
    proc = Popen([sys.executable, '-m', runner, 'list'],
                 stdout=PIPE, stderr=PIPE, env=env)
    out, err = proc.communicate()
    if proc.returncode:
        msg = (err or out or '').decode('utf8')
        raise RuntimeError(msg.encode('unicode_escape') if six.PY2 else msg)

    # FIXME: can we reliably decode as UTF-8?
    # scrapy list does `print(list)`
    spiders = out.decode('utf-8').splitlines()

    # Store the result, creating the per-project cache entry if needed.
    try:
        per_project = get_spider_list.cache[project]
    except KeyError:
        per_project = {}
    per_project[version] = spiders
    get_spider_list.cache[project] = per_project
    return spiders
Пример #3
0
 def setUp(self):
     """Create a scratch directory and an Environment wired to it."""
     scratch = self.mktemp()
     os.mkdir(scratch)
     cfg = Config(values={'eggs_dir': scratch, 'logs_dir': scratch})
     cfg.cp.add_section('settings')
     cfg.cp.set('settings', 'newbot', 'newbot.settings')
     self.environ = Environment(cfg, initenv={})
Пример #4
0
def get_spider_list(project, runner=None, pythonpath=None, version=''):
    """Return the spider list from the given project, using the given runner."""
    # Lazily attach the cache to the function object.
    if not hasattr(get_spider_list, 'cache'):
        get_spider_list.cache = UtilsCache()
    try:
        return get_spider_list.cache[project][version]
    except KeyError:
        pass
    if runner is None:
        runner = Config().get('runner')

    # Spawn `python -m <runner> list` with the project selected via env vars.
    env = os.environ.copy()
    env['SCRAPY_PROJECT'] = project
    if pythonpath:
        env['PYTHONPATH'] = pythonpath
    if version:
        env['SCRAPY_EGG_VERSION'] = version
    proc = Popen([sys.executable, '-m', runner, 'list'],
                 stdout=PIPE, stderr=PIPE, env=env)
    out, err = proc.communicate()
    if proc.returncode:
        failure = err or out or 'unknown error'
        raise RuntimeError(failure.splitlines()[-1])

    spiders = out.splitlines()
    # Update the cache, creating the per-project dict on first use.
    try:
        per_project = get_spider_list.cache[project]
    except KeyError:
        per_project = {}
    per_project[version] = spiders
    get_spider_list.cache[project] = per_project
    return spiders
Пример #5
0
 def test_egg_config_application(self):
     """The eggstorage class named in the config must be the component
     the application actually registers."""
     config = Config()
     eggstore = 'scrapyd.tests.test_eggstorage.SomeFakeEggStorage'
     config.cp.set('scrapyd', 'eggstorage', eggstore)
     app = application(config)
     app_eggstorage = app.getComponent(IEggStorage)
     assert isinstance(app_eggstorage, SomeFakeEggStorage)
     # Bug fix: the comparison result was previously discarded — the line
     # lacked `assert`, so the check never actually ran.
     assert app_eggstorage.list_projects() == ['hello_world']
Пример #6
0
 def setUp(self):
     """Build a MemoryJobStorage capped at two finished jobs and seed
     it with the three module-level jobs."""
     scratch = self.mktemp()
     cfg = Config(values={'dbs_dir': scratch, 'finished_to_keep': '2'})
     self.jobst = MemoryJobStorage(cfg)
     self.j1, self.j2, self.j3 = j1, j2, j3
     for job in (self.j1, self.j2, self.j3):
         self.jobst.add(job)
Пример #7
0
 def test_get_environment_with_no_items_dir(self):
     """With empty items_dir/logs_dir, get_environment must not emit
     SCRAPY_FEED_URI or SCRAPY_LOG_FILE."""
     config = Config(values={'items_dir': '', 'logs_dir': ''})
     config.cp.add_section('settings')
     config.cp.set('settings', 'newbot', 'newbot.settings')
     msg = {'_project': 'mybot', '_spider': 'myspider', '_job': 'ID'}
     slot = 3
     environ = Environment(config, initenv={})
     env = environ.get_environment(msg, slot)
     # failUnless() is a deprecated unittest alias (removed in Python 3.12);
     # assertNotIn states the intent directly and gives better failure output.
     self.assertNotIn('SCRAPY_FEED_URI', env)
     self.assertNotIn('SCRAPY_LOG_FILE', env)
Пример #8
0
 def setUp(self):
     """Lay out eggs/dbs directories with two projects and build the
     queues plus a QueuePoller over them."""
     base = self.mktemp()
     eggs_dir = os.path.join(base, 'eggs')
     dbs_dir = os.path.join(base, 'dbs')
     for subdir in (eggs_dir,
                    dbs_dir,
                    os.path.join(eggs_dir, 'mybot1'),
                    os.path.join(eggs_dir, 'mybot2')):
         os.makedirs(subdir)
     cfg = Config(values={'eggs_dir': eggs_dir, 'dbs_dir': dbs_dir})
     self.queues = get_spider_queues(cfg)
     self.poller = QueuePoller(cfg)
Пример #9
0
 def list(project):
     """Return [{'version': ..., 'checksum': ...}] for every egg of
     *project*, ordered by version (MD5 of the egg file as checksum)."""
     eggdir = path.join(Config().get("eggs_dir"), project)
     versions = {}
     for egg_path in glob("%s/*.egg" % eggdir):
         # Use a context manager so the egg file handle is always closed;
         # the previous open(x).read() leaked a descriptor per egg.
         with open(egg_path, 'rb') as f:
             digest = hashlib.md5(f.read()).hexdigest()
         versions[path.splitext(path.basename(egg_path))[0]] = digest
     return [{
         "version": version,
         "checksum": versions[version]
     } for version in sorted(versions.keys(), key=LooseVersion)]
Пример #10
0
def get_spider_list(project, runner=None):
    """Return the spider list from the given project, using the given runner."""
    runner = Config().get('runner') if runner is None else runner
    # Select the project through the environment of the child process.
    child_env = os.environ.copy()
    child_env['SCRAPY_PROJECT'] = project
    proc = Popen([sys.executable, '-m', runner, 'list'],
                 stdout=PIPE, stderr=PIPE, env=child_env)
    out, err = proc.communicate()
    if proc.returncode:
        failure = err or out or 'unknown error'
        raise RuntimeError(failure.splitlines()[-1])
    return out.splitlines()
Пример #11
0
 def test_get_environment_with_logfile(self):
     """A logs_dir plus logs_filename template should produce a
     SCRAPY_LOG_FILE entry in the spawned environment."""
     config = Config(
         values={
             'items_dir': '',
             'logs_dir': '.',
             'logs_filename': '{project}-{spider}-{job}-{Y}{m}{d}T{H}{M}{S}'
         })
     msg = {'_project': 'mybot', '_spider': 'myspider', '_job': 'ID'}
     slot = 3
     environ = Environment(config, initenv={})
     now = datetime.datetime.now()
     env = environ.get_environment(msg, slot)
     expected_logfilename = now.strftime("mybot-spider-%Y%m%dT%H%M%S")
     # assert_() is a deprecated unittest alias (removed in Python 3.12);
     # assertTrue keeps the original semantics: only truthiness of the
     # first argument is checked, the second is the failure message.
     # NOTE(review): this was almost certainly meant to be assertEqual,
     # and the expected name looks wrong ("spider" vs "myspider", job id
     # missing) — confirm the template expansion before tightening.
     self.assertTrue(env['SCRAPY_LOG_FILE'], expected_logfilename)
Пример #12
0
def main():
    """Register this worker in ZooKeeper, then start twistd with
    scrapyd's txapp."""
    logging.basicConfig(level=logging.INFO)
    config = Config()
    bind = config.get('bind_address', '127.0.0.1')
    port = config.getint('http_port', 6800)
    Register(to_bytes('http://%s:%d' % (bind, port)),
             config.get('register_path', '/scrapyd-cluster/worker'),
             hosts=config.get('zookeeper_hosts', '127.0.0.1:2181'))

    # Inject twistd arguments: no-daemon, run scrapyd's twisted app file.
    argv[1:1] = ['-n', '-y', join(dirname(scrapyd.__file__), 'txapp.py')]
    run()
Пример #13
0
def get_spider_list(project, runner=None, pythonpath=None, version=''):
    """Return the spider list from the given project, using the given runner.

    Results are cached per (project, version) on the function object.
    Raises RuntimeError when the runner subprocess exits non-zero.
    """
    if "cache" not in get_spider_list.__dict__:
        get_spider_list.cache = UtilsCache()
    try:
        return get_spider_list.cache[project][version]
    except KeyError:
        pass
    if runner is None:
        runner = Config().get('runner')
    env = os.environ.copy()
    env['PYTHONIOENCODING'] = 'UTF-8'
    # Accept both text and UTF-8 encoded byte strings for the project name
    # (the previous unconditional .decode('utf8') broke on str input).
    env['SCRAPY_PROJECT'] = project.decode('utf8') if isinstance(project, bytes) else project
    if pythonpath:
        env['PYTHONPATH'] = pythonpath
    if version:
        env['SCRAPY_EGG_VERSION'] = version
    pargs = [sys.executable, '-m', runner, 'list']
    proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env)
    out, err = proc.communicate()
    # Re-enabled: the returncode check had been commented out (along with
    # debug prints left in), which made runner failures pass silently and
    # return an empty spider list.
    if proc.returncode:
        msg = (err or out or b'').decode('utf8')
        raise RuntimeError(msg)
    # FIXME: can we reliably decode as UTF-8?
    # scrapy list does `print(list)`
    tmp = out.decode('utf-8').splitlines()
    try:
        project_cache = get_spider_list.cache[project]
        project_cache[version] = tmp
    except KeyError:
        project_cache = {version: tmp}
    get_spider_list.cache[project] = project_cache
    return tmp
Пример #14
0
def parse_spider_log(project, spider, jobid, keyword):
    """Scan a job's log file for lines containing *keyword* and return the
    trailing '{...}' dict of the last such line, or {} when the log file
    or keyword is absent."""
    import re
    logdir = Config().get('logs_dir')
    # logs_dir/<project>/<spider>/<jobid>.log
    file_name = os.path.join(logdir, project, spider, jobid + '.log')
    res = {}
    if not os.path.exists(file_name):
        return res
    logs = ''
    with open(file_name, 'r') as f:
        # Keep only the last line mentioning the keyword.
        for line in f:
            if keyword in line:
                logs = line
    if logs:
        match_str = re.search(r'{.*?}\n$', logs)
        if match_str:
            # SECURITY NOTE: eval() on log content is dangerous if logs can
            # contain attacker-influenced text. Kept because scrapy stats
            # dicts embed non-literal reprs (e.g. datetime(...)) that
            # ast.literal_eval cannot parse — review before exposing to
            # untrusted logs.
            res = eval(match_str.group().strip())
    return res
Пример #15
0
def _get_config():
    """Build a scrapyd Config whose data directories live under the
    project data dir, creating those directories on first use."""
    datadir = os.path.join(project_data_dir(), 'scrapyd')
    conf = {
        'eggs_dir': os.path.join(datadir, 'eggs'),
        'logs_dir': os.path.join(datadir, 'logs'),
        'items_dir': os.path.join(datadir, 'items'),
        'dbs_dir': os.path.join(datadir, 'dbs'),
    }
    for d in conf.values():  # create dirs
        # exist_ok avoids the check-then-create race of the previous
        # `if not os.path.exists(d)` guard.
        os.makedirs(d, exist_ok=True)
    scrapyd_conf = """
[scrapyd]
eggs_dir = %(eggs_dir)s
logs_dir = %(logs_dir)s
items_dir = %(items_dir)s
dbs_dir  = %(dbs_dir)s
    """ % conf
    return Config(extra_sources=[StringIO(scrapyd_conf)])
Пример #16
0
 def setUp(self):
     """Point a FilesystemEggStorage at a scratch eggs directory."""
     eggs_dir = self.mktemp()
     cfg = Config(values={'eggs_dir': eggs_dir})
     self.eggst = FilesystemEggStorage(cfg)
Пример #17
0
#Stage 2 Update (Python 3)
from future import standard_library
standard_library.install_aliases()
from builtins import object
import datetime, json
import urllib.request, urllib.parse, http.client
from scrapy.utils.project import get_project_settings
settings = get_project_settings()
from scrapyd.config import Config
scrapyd_config = Config()
scrapyd_port = scrapyd_config.getint('http_port', 6800)
from dynamic_scraper.models import Scraper

class TaskUtils(object):

    conf = {
        "MAX_SPIDER_RUNS_PER_TASK": 10,
        "MAX_CHECKER_RUNS_PER_TASK": 25,
    }


    def _run_spider(self, **kwargs):
        param_dict = {
            'project': 'default',
            'spider': kwargs['spider'],
            'id': kwargs['id'],
            'run_type': kwargs['run_type'],
            'do_action': kwargs['do_action']
        }
        params = urllib.parse.urlencode(param_dict)
        headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"}
Пример #18
0
def get_application(config=None):
    """Instantiate the application factory named in the config (falling
    back to scrapyd's default) and invoke it with that config."""
    cfg = Config() if config is None else config
    factory = load_object(cfg.get('application', 'scrapyd.app.application'))
    return factory(cfg)
Пример #19
0
 def __init__(self):
     """Open the utils-cache table in the configured cache database
     (an in-memory SQLite db by default)."""
     database = Config().get("cache_dbs", default=":memory:")
     self.cache_manager = JsonSqliteDict(database=database,
                                         table="utils_cache_manager")
Пример #20
0
#Stage 2 Update (Python 3)
from future import standard_library
standard_library.install_aliases()
from builtins import object
import datetime, json
import urllib.request, urllib.parse, http.client
import os
from scrapy.utils.project import get_project_settings
settings = get_project_settings()
from scrapyd.config import Config
scrapyd_config = Config()
scrapyd_port = scrapyd_config.getint('http_port', 6800)
from dynamic_scraper.models import Scraper


class TaskUtils(object):

    conf = {
        "MAX_SPIDER_RUNS_PER_TASK": 10,
        "MAX_CHECKER_RUNS_PER_TASK": 25,
    }

    def _run_spider(self, **kwargs):
        scrapyd_host = os.environ.get('SCRAPYD_HOST', 'localhost')
        scrapyd_port = os.environ.get('SCRAPYD_PORT', '6800')
        param_dict = {
            'project': 'default',
            'spider': kwargs['spider'],
            'id': kwargs['id'],
            'run_type': kwargs['run_type'],
            'do_action': kwargs['do_action']
Пример #21
0
import os
from scrapyd.eggstorage import FilesystemEggStorage
from scrapyd.config import Config
import urllib2
from poster.encode import multipart_encode
from poster.streaminghttp import register_openers
register_openers()

source_dir = '/kf/scrapyd'

dest_url = 'http://localhost:6801/addversion.json'

source_eggs_dir = os.path.join(source_dir, 'eggs')
source_config = Config({'eggs_dir': source_eggs_dir})
source_egg_storage = FilesystemEggStorage(source_config)
for dir in os.listdir(source_eggs_dir):
    #print dir
    project = dir
    version, egg = source_egg_storage.get(project)
    print project, version
    post_data = {
        'egg': egg,
        'project': project,
        'version': version,
    }
    datagen, headers = multipart_encode(post_data)
    request = urllib2.Request(url=dest_url, headers=headers, data=datagen)
    try:
        res = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print 'HTTPError: %s' % e
Пример #22
0
# -*- coding: utf-8 -*-
from scrapyd.config import Config
from SpiderKeeper.scrapyd.app import create_spiderkeeper_application

# WSGI entry point: build the SpiderKeeper application from the default
# scrapyd configuration. WSGI servers look up the module-level name
# `application`, so it must keep this exact name.
application = create_spiderkeeper_application(Config())