Example #1
 def __init__(self):
     self.logger = logging.getLogger(self.__class__.__name__)
     self.logger.info("Constructing rabbitmq logger")
     username = get_config().get('rabbitmq', 'username')
     password = get_config().get('rabbitmq', 'password')
     credentials = pika.credentials.PlainCredentials(
         username=username,
         password=password
     )
     host = get_config().get('rabbitmq', 'host')
     parameters = pika.ConnectionParameters(
         host=host,
         port=5672,
         virtual_host='/',
         credentials=credentials
     )
     connection = pika.BlockingConnection(
         parameters=parameters
     )
     channel = connection.channel()
     # we're publishing to two channels, the download request
     # so that a download queue can pick it up
     channel.queue_declare('crisis_download_requests')
     # and a fanout exchange to notify listeners that we've crawled something
     channel.exchange_declare(
         'crisis_crawl',
         type='fanout',
         durable=True
     )
     self.channel = channel
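The comment above notes that this constructor publishes to both a download-request queue and a 'crisis_crawl' fanout exchange. Below is a minimal sketch of a listener on that fanout exchange; it assumes pika >= 1.0 (where exchange_declare takes exchange_type rather than the older type keyword used above), and the host and callback names are placeholders.

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
channel = connection.channel()
# declare the same fanout exchange the crawler publishes to
channel.exchange_declare(exchange='crisis_crawl', exchange_type='fanout', durable=True)
# an exclusive, server-named queue that receives a copy of every crawl notification
result = channel.queue_declare(queue='', exclusive=True)
channel.queue_bind(exchange='crisis_crawl', queue=result.method.queue)

def on_crawl_notification(ch, method, properties, body):
    print("crawled:", body)

channel.basic_consume(queue=result.method.queue,
                      on_message_callback=on_crawl_notification,
                      auto_ack=True)
channel.start_consuming()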
Example #2
 def __init__(self):
     self.logger = logging.getLogger(self.__class__.__name__)
     self.logger.info("Constructing rabbitmq logger")
     username = get_config().get('rabbitmq', 'username')
     password = get_config().get('rabbitmq', 'password')
     credentials = pika.credentials.PlainCredentials(
         username=username,
         password=password
     )
     host = get_config().get('rabbitmq', 'host')
     parameters = pika.ConnectionParameters(
         host=host,
         port=5672,
         virtual_host='/',
         credentials=credentials
     )
     connection = pika.BlockingConnection(
         parameters=parameters
     )
     channel = connection.channel()
     # we're publishing to two channels, the download request
     # so that a download queue can pick it up
     channel.queue_declare('crisis_download_requests')
     # and a fanout exchange to notify listeners that we've crawled something
     channel.exchange_declare(
         'crisis_crawl',
         type='fanout'
     )
     self.channel = channel
Example #3
 def build(self):
     closest = closest_scrapy_cfg()
     os.chdir(os.path.dirname(closest))
     if not os.path.exists('setup.py'):
         if not self.settings_name:
             self.settings_name = 'default'
         settings = get_config().get('settings', self.settings_name)
         self._create_default_setup_py(settings=settings)
         print('building egg using [%s] settings' % self.settings_name)
     if not self.file_path:
         self.file_path = tempfile.mkdtemp(prefix="scrapydeploy-")
     else:
         if not os.path.exists(self.file_path):
             os.mkdir(self.file_path)
     d = self.file_path
     o = open(os.path.join(d, "stdout"), "wb")
     e = open(os.path.join(d, "stderr"), "wb")
     retry_on_eintr(
         check_call,
         [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
         stdout=o,
         stderr=e)
     o.close()
     e.close()
     egg = glob.glob(os.path.join(d, '*.egg'))[0]
     self._rename_egg_name(egg, d)
Example #4
 def getCrawlInfoAddress(cls):
     settings = get_project_settings()
     localIp = settings.get("CRAWL_IP", HttpUtils.get_ip_address())
     cfg = get_config()
     scrapydUrl = cfg.get('deploy', "url").replace("http://",
                                                   "").replace("/", "")
     address = scrapydUrl.replace("localhost", localIp)
     return address
Example #5
 def __init__(self, queuename="default"):
     self.queuename = queuename
     self.seen_queues = []
     self.connection = pika.BlockingConnection(
         pika.ConnectionParameters(conf.get_config().get('RabbitConfig', 'host')))
     self.channel = self.connection.channel()
     self.client_params = {"x-ha-policy": "all"}
     self.channel.queue_declare(queue=self.queuename, arguments=self.client_params)
     self.seen_queues.append(self.queuename)
Example #6
def project_data_dir(project='default'):
    """Return the current project data dir, creating it if it doesn't exist"""
    assert inside_project(), "Not inside project"
    scrapy_cfg = closest_scrapy_cfg()
    d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
    cfg = get_config()
    if cfg.has_option(DATADIR_CFG_SECTION, project):
        d = cfg.get(DATADIR_CFG_SECTION, project)
    if not exists(d):
        makedirs(d)
    return d
Example #7
File: deploy.py Project: robyoung/scrapy
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists("setup.py"):
        settings = get_config().get("settings", "default")
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call([sys.executable, "setup.py", "clean", "-a", "bdist_egg", "-d", d], stdout=f)
    egg = glob.glob(os.path.join(d, "*.egg"))[0]
    return egg, d
Example #8
File: deploy.py Project: chzealot/scrapy
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call([sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d], stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #9
def _get_targets():
    cfg = get_config()
    baset = dict(cfg.items('deploy')) if cfg.has_section('deploy') else {}
    baset.setdefault('url', 'http://dash.scrapinghub.com/api/scrapyd/')
    targets = {}
    targets['default'] = baset
    for x in cfg.sections():
        if x.startswith('deploy:'):
            t = baset.copy()
            t.update(cfg.items(x))
            targets[x[7:]] = t
    return targets
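_get_targets() above builds one entry per [deploy:<name>] section in scrapy.cfg, layered on top of the base [deploy] section. A minimal sketch with a hypothetical config, using configparser directly (get_config() returns a ConfigParser-style object) to show the resulting dict:

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read_string("""
[deploy]
url = http://localhost:6800/
project = example

[deploy:production]
url = http://scrapyd.example.com:6800/
username = deployuser
""")

baset = dict(cfg.items('deploy')) if cfg.has_section('deploy') else {}
targets = {'default': baset}
for x in cfg.sections():
    if x.startswith('deploy:'):
        t = baset.copy()        # named targets inherit the base [deploy] settings...
        t.update(cfg.items(x))  # ...and override them with their own section
        targets[x[7:]] = t
# targets == {'default': {'url': 'http://localhost:6800/', 'project': 'example'},
#             'production': {'url': 'http://scrapyd.example.com:6800/',
#                            'project': 'example', 'username': 'deployuser'}}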
Example #10
File: deploy.py Project: chzealot/scrapy
def _get_targets():
    cfg = get_config()
    baset = dict(cfg.items('deploy')) if cfg.has_section('deploy') else {}
    targets = {}
    if 'url' in baset:
        targets['default'] = baset
    for x in cfg.sections():
        if x.startswith('deploy:'):
            t = baset.copy()
            t.update(cfg.items(x))
            targets[x[7:]] = t
    return targets
Example #11
File: deploy.py Project: robyoung/scrapy
def _get_targets():
    cfg = get_config()
    baset = dict(cfg.items("deploy")) if cfg.has_section("deploy") else {}
    targets = {}
    if "url" in baset:
        targets["default"] = baset
    for x in cfg.sections():
        if x.startswith("deploy:"):
            t = baset.copy()
            t.update(cfg.items(x))
            targets[x[7:]] = t
    return targets
Example #12
def _get_targets():
    cfg = get_config()
    baset = dict(cfg.items('deploy')) if cfg.has_section('deploy') else {}
    targets = {}
    if 'url' in baset:
        targets['default'] = baset
    for x in cfg.sections():
        if x.startswith('deploy:'):
            t = baset.copy()
            t.update(cfg.items(x))
            targets[x[7:]] = t
    return targets
Example #13
def _get_targets():
    cfg = get_config()
    baset = dict(cfg.items('deploy')) if cfg.has_section('deploy') else {}
    baset.setdefault('url', 'http://dash.scrapinghub.com/api/scrapyd/')
    targets = {}
    targets['default'] = baset
    for x in cfg.sections():
        if x.startswith('deploy:'):
            t = baset.copy()
            t.update(cfg.items(x))
            targets[x[7:]] = t
    return targets
Example #14
File: login.py Project: pombredanne/shub
def _read_scrapy_cfg_key():
    try:
        from scrapy.utils.conf import get_config
        cfg = get_config()

        if cfg.has_section('deploy'):
            deploy = dict(cfg.items('deploy'))
            key = deploy.get('username')

            if key:
                return key
    except:
        return
Example #15
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call(
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #16
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp(prefix="scrapydeploy-")
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(check_call, [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d], stdout=o, stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #17
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp(prefix="scrapydeploy-")
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(check_call, [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d], stdout=o, stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #18
def _build_egg(keep_build):
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    retry_on_eintr(check_call, [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d], stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    if not keep_build:
        shutil.rmtree('build')
        shutil.rmtree('project.egg-info')
    return egg, d
Example #19
def project_data_dir(project='default'):
    """Return the current project data dir, creating it if it doesn't exist"""
    if not inside_project():
        raise NotConfigured("Not inside a project")
    cfg = get_config()
    if cfg.has_option(DATADIR_CFG_SECTION, project):
        d = cfg.get(DATADIR_CFG_SECTION, project)
    else:
        scrapy_cfg = closest_scrapy_cfg()
        if not scrapy_cfg:
            raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir")
        d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
    if not exists(d):
        os.makedirs(d)
    return d
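project_data_dir() lets scrapy.cfg relocate the project data dir through the section named by DATADIR_CFG_SECTION. A minimal sketch of that lookup, assuming the constant resolves to a [datadir] section and using hypothetical paths:

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read_string("""
[datadir]
default = /var/lib/scrapy/example-project
""")

project = 'default'
if cfg.has_option('datadir', project):
    d = cfg.get('datadir', project)   # explicit override from scrapy.cfg
else:
    d = '.scrapy'                     # otherwise fall back to .scrapy next to scrapy.cfg
print(d)  # /var/lib/scrapy/example-project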
Example #20
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        scrapy_project_settings = get_config()
        settings = scrapy_project_settings.get('settings', 'default')
        project = scrapy_project_settings.get('deploy', 'project')
        _create_default_setup_py(settings=settings, project=project)
    d = 'dist'
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=sys.stdout,
        stderr=sys.stderr)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #21
File: shell.py Project: bf96163/scrapy
 def start(self,
           url=None,
           request=None,
           response=None,
           spider=None,
           redirect=True):
     # disable accidental Ctrl-C key press from shutting down the engine
     signal.signal(signal.SIGINT, signal.SIG_IGN)
     if url:
         self.fetch(url, spider, redirect=redirect)  # if a URL is given, fetch the URL
     elif request:
         self.fetch(request, spider)  # if a request is given, fetch the request
     elif response:
         request = response.request
         self.populate_vars(response, request, spider)
     else:
         self.populate_vars()
     if self.code:
         print(eval(
             self.code, globals(),
             self.vars))  # eval: the first arg is the code string, the second is globals, the third is locals
     else:  # no code was passed in: read the desired Python shell from the env var and scrapy.cfg
         """
         Detect interactive shell setting in scrapy.cfg
         e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
         [settings]
         # shell can be one of ipython, bpython or python;
         # to be used as the interactive python console, if available.
         # (default is ipython, fallbacks in the order listed above)
         shell = python
         """
         cfg = get_config()  # configuration from scrapy.cfg
         section, option = 'settings', 'shell'
         env = os.environ.get('SCRAPY_PYTHON_SHELL')
         shells = []
         if env:
             shells += env.strip().lower().split(',')
         elif cfg.has_option(section, option):
             shells += [cfg.get(section, option).strip().lower()]
         else:  # try all by default
             shells += DEFAULT_PYTHON_SHELLS.keys()
         # always add standard shell as fallback
         shells += ['python']
         start_python_console(self.vars,
                              shells=shells,
                              banner=self.vars.pop('banner', ''))  # start the shell
Example #22
 def start(self,
           url=None,
           request=None,
           response=None,
           spider=None,
           redirect=True):
     # disable accidental Ctrl-C key press from shutting down the engine
     signal.signal(signal.SIGINT, signal.SIG_IGN)
     if url:
         self.fetch(url, spider, redirect=redirect)
     elif request:
         self.fetch(request, spider)
     elif response:
         request = response.request
         self.populate_vars(response, request, spider)
     else:
         self.populate_vars()
     if self.code:
         print(eval(self.code, globals(), self.vars))
     else:
         """
         Detect interactive shell setting in scrapy.cfg
         e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
         [settings]
         # shell can be one of ipython, bpython or python;
         # to be used as the interactive python console, if available.
         # (default is ipython, fallbacks in the order listed above)
         shell = python
         """
         cfg = get_config()
         section, option = 'settings', 'shell'
         env = os.environ.get('SCRAPY_PYTHON_SHELL')
         shells = []
         if env:
             shells += env.strip().lower().split(',')
         elif cfg.has_option(section, option):
             shells += [cfg.get(section, option).strip().lower()]
         else:  # try all by default
             shells += DEFAULT_PYTHON_SHELLS.keys()  # all supported interactive Python shells
         # always add standard shell as fallback
         shells += ['python']  # always append the standard python shell at the end
         # start the shell interface for the selected shell type
         start_python_console(self.vars,
                              shells=shells,
                              banner=self.vars.pop('banner', ''))
Example #23
def build_egg(project):
    path = get_run_path()
    path = '{path}/storage/{project}/'.format(path=path, project=project.name)
    os.chdir(path)
    settings = get_config().get('settings', 'default')
    _create_default_setup_py(settings=settings, project=project.name)
    d = tempfile.mkdtemp(prefix="scrapydeploy-")
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=o,
        stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    os.system('rm *.egg')
    shutil.move(egg, path)
    return find_egg(path)
Example #24
File: shell.py Project: runt18/scrapy
 def start(self, url=None, request=None, response=None, spider=None):
     # disable accidental Ctrl-C key press from shutting down the engine
     signal.signal(signal.SIGINT, signal.SIG_IGN)
     if url:
         self.fetch(url, spider)
     elif request:
         self.fetch(request, spider)
     elif response:
         request = response.request
         self.populate_vars(response, request, spider)
     else:
         self.populate_vars()
     if self.code:
         print(eval(self.code, globals(), self.vars))
     else:
         """
         Detect interactive shell setting in scrapy.cfg
         e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
         [settings]
         # shell can be one of ipython, bpython or python;
         # to be used as the interactive python console, if available.
         # (default is ipython, fallbacks in the order listed above)
         shell = python
         """
         cfg = get_config()
         section, option = 'settings', 'shell'
         env = os.environ.get('SCRAPY_PYTHON_SHELL')
         shells = []
         if env:
             shells += env.strip().lower().split(',')
         elif cfg.has_option(section, option):
             shells += [cfg.get(section, option).strip().lower()]
         else:  # try all by default
             shells += DEFAULT_PYTHON_SHELLS.keys()
         # always add standard shell as fallback
         shells += ['python']
         start_python_console(self.vars, shells=shells,
                              banner=self.vars.pop('banner', ''))
Example #25
def _get_option(section, option, default=None):
    cfg = get_config()
    return cfg.get(section, option) if cfg.has_option(section, option) \
        else default
Example #26
    address = StringUtils.getCrawlInfoAddress()
    url = settings.get("CRAWL_INFO_SCHEDULE_URL") % (address, "", "")
    jsonResult = HttpUtils.getUrl(url)
    list = jsonResult["data"]
    return list


def test(url, project):
    print("%s %s" % (url, project))
    # logger.info("%s %s" % (url,project))


if __name__ == '__main__':
    # logging.basicConfig(filename="/home/yhye/python_project/gemantic-python/crawl_selected/log.conf")
    # logger = logging.getLogger("main")
    cfg = get_config()
    url = cfg.get('deploy', "url")
    settings = get_project_settings()
    # project = settings.get("BOT_NAME")

    # logger.info("start schedule %s %s" % (url,project))
    now = datetime.datetime.now()
    # sys.stderr.write("%s start schedule %s %s" % (now,url, project))
    print("%s start schedule %s" % (now, url))
    # schedule.every().minute.do(test,url,project)
    crawlInfos = getCrawlInfo()
    print(crawlInfos)
    for crawlInfo in crawlInfos:
        scheduleType = crawlInfo["scheduleType"]
        scheduleEvery = crawlInfo["schedule"]
        project = crawlInfo["project"]
Example #27
class SentinelSpider(XMLFeedSpider):
    def __init__(self,
                 settings,
                 request_id=ID,
                 polygon=None,
                 begin_date=None,
                 end_date=None,
                 *args,
                 **kwargs):
        """construct with settings"""
        self.settings = settings
        self.logger.info("polygon %s", polygon)
        self.request_id = request_id

        # last 2 weeks by default
        if begin_date is None and end_date is None:
            # from globals
            begin_position = "%s TO %s" % (BEGIN_DATE, END_DATE)
            end_position = "%s TO %s" % (BEGIN_DATE, END_DATE)
        else:
            # arguments
            begin_position = "%s TO %s" % (begin_date, end_date)
            end_position = "%s TO %s" % (begin_date, end_date)

        # build the query
        query_parts = []
        if polygon:
            query_parts.append(
                'footprint:"Intersects({polygon})"'.format(polygon=polygon))
        if begin_position:
            query_parts.append('beginPosition:[{0}]'.format(begin_position))
        if end_position:
            query_parts.append('endPosition:[{0}]'.format(end_position))
        # build the start url
        query = " AND ".join(query_parts)
        self.start_urls = [
            'https://' + domain + '/dhus/api/search?' +
            urllib.urlencode({"q": query if query_parts else "*"})
        ]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """expose the settings"""
        settings = crawler.settings
        spider = cls(settings, *args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    name = 'sentinel'
    allowed_domains = [domain]

    rules = [
        # Extract links matching 'dhus/api/search'
        # and follow links from them (since no callback means follow=True by default).
        # todo fix restrict_xpaths to allow for non-html links
        Rule(
            LxmlLinkExtractor(allow="/dhus/api/search",
                              tags="{http://www.w3.org/2005/Atom}link",
                              attrs=("href", )))
    ]

    # the iternode iterator does not work because the xml is using nested xml
    iterator = 'xml'
    itertag = 'atom:entry'
    namespaces = [('atom', 'http://www.w3.org/2005/Atom'),
                  ('opensearch', 'http://a9.com/-/spec/opensearch/1.1/')]
    # get passwords from config
    http_user = get_config().get(domain, 'username')
    http_pass = get_config().get(domain, 'password')

    def parse_nodes(self, response, nodes):
        """
        Inherited from XMLFeedSpider
        Extended to also return requests.
        """
        for selector in nodes:
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                yield result_item
        seen = set()
        for i, rule in enumerate(self.rules):
            links = [
                l for l in rule.link_extractor.extract_links(response)
                if l not in seen
            ]
            self.logger.info('links %s', links)
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url)
                r.meta.update(rule=i, link_text=link.text)
                yield rule.process_request(r)

    def parse_node(self, response, selector):
        self.logger.info("selector %s", selector)
        l = ItemLoader(SatItem(), selector=selector, response=response)
        l.default_output_processor = TakeFirst()
        l.add_xpath("metadata", "atom:link[@rel='alternative']/@href")
        l.add_xpath("icon", "atom:link[@rel='icon']/@href")
        l.add_xpath("download", "atom:link/@href")
        l.add_xpath('footprint', "atom:str[@name='footprint']/text()")
        l.add_xpath('id', 'atom:id/text()')
        l.add_xpath('identifier', "atom:str[@name='identifier']/text()")
        l.add_value('requestId', self.request_id)
        i = l.load_item()
        return i
Example #28
def get_option(section: str, option: str, cfg=get_config()):
    '''Gets option from scrapy.cfg in project root'''
    return cfg.get(section, option) if cfg.has_option(section, option) else None
Example #29
File: deploy.py Project: chzealot/scrapy
def _get_option(section, option, default=None):
    cfg = get_config()
    return cfg.get(section, option) if cfg.has_option(section, option) \
        else default
Example #30
    "hurricane",
    "tsunami",
    "heavy rainfall",
    "overlast water",
    "overstroming",
    "inondation",
    "flut",
    "regenval",
    "hevige neerslag",
    "niederschlagsmenge",
    "dike breakthrough",
    "dijk doorbraak"
]

points = []
accesstoken = get_config().get('opencalais', 'token')


class newsspider(XMLFeedSpider):
    name = 'newsspider'
    start_urls = []
    for t in range(len(tags)):
        start_urls.append('http://emm.newsbrief.eu/rss?type=search&mode=advanced&atleast=' + tags[t])
    start_urls.append('http://feeds.feedburner.com/Floodlist')
    namespaces = [
        ("georss", "http://www.georss.org/georss")]

    def text_scan(self, description):
        text = description.split(" ")
        textlist = []
        for i in range(len(text)):
Example #31
import datetime as dt
import numpy as np

from scrapy.spiders import XMLFeedSpider, Spider, CrawlSpider, Rule
from scrapy.utils.conf import get_config

start_urls = []
# some tags used in floods
tags = [
    "flood", "hurricane", "tsunami", "heavy rainfall", "overlast water",
    "overstroming", "inondation", "flut", "regenval", "hevige neerslag",
    "niederschlagsmenge", "dike breakthrough", "dijk doorbraak"
]

points = []
accesstoken = get_config().get('opencalais', 'token')


class newsspider(XMLFeedSpider):
    name = 'newsspider'
    start_urls = []
    for t in range(len(tags)):
        start_urls.append(
            'http://emm.newsbrief.eu/rss?type=search&mode=advanced&atleast=' +
            tags[t])
    start_urls.append('http://feeds.feedburner.com/Floodlist')
    namespaces = [("georss", "http://www.georss.org/georss")]

    def text_scan(self, description):
        text = description.split(" ")
        textlist = []