def __init__(self):
    self.logger = logging.getLogger(self.__class__.__name__)
    self.logger.info("Constructing rabbitmq logger")
    username = get_config().get('rabbitmq', 'username')
    password = get_config().get('rabbitmq', 'password')
    credentials = pika.credentials.PlainCredentials(
        username=username,
        password=password
    )
    host = get_config().get('rabbitmq', 'host')
    parameters = pika.ConnectionParameters(
        host=host,
        port=5672,
        virtual_host='/',
        credentials=credentials
    )
    connection = pika.BlockingConnection(parameters=parameters)
    channel = connection.channel()
    # we publish to two destinations: the download request queue,
    # so that a download worker can pick requests up
    channel.queue_declare('crisis_download_requests')
    # and a fanout exchange to notify listeners that we've crawled something
    channel.exchange_declare(
        'crisis_crawl',
        type='fanout',
        durable=True
    )
    self.channel = channel
def __init__(self):
    self.logger = logging.getLogger(self.__class__.__name__)
    self.logger.info("Constructing rabbitmq logger")
    username = get_config().get('rabbitmq', 'username')
    password = get_config().get('rabbitmq', 'password')
    credentials = pika.credentials.PlainCredentials(
        username=username,
        password=password
    )
    host = get_config().get('rabbitmq', 'host')
    parameters = pika.ConnectionParameters(
        host=host,
        port=5672,
        virtual_host='/',
        credentials=credentials
    )
    connection = pika.BlockingConnection(parameters=parameters)
    channel = connection.channel()
    # we publish to two destinations: the download request queue,
    # so that a download worker can pick requests up
    channel.queue_declare('crisis_download_requests')
    # and a fanout exchange to notify listeners that we've crawled something
    channel.exchange_declare(
        'crisis_crawl',
        type='fanout'
    )
    self.channel = channel
def build(self):
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        if not self.settings_name:
            self.settings_name = 'default'
        settings = get_config().get('settings', self.settings_name)
        self._create_default_setup_py(settings=settings)
        print('building egg using [%s] settings' % self.settings_name)
    if not self.file_path:
        self.file_path = tempfile.mkdtemp(prefix="scrapydeploy-")
    elif not os.path.exists(self.file_path):
        os.mkdir(self.file_path)
    d = self.file_path
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=o, stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    self._rename_egg_name(egg, d)
def getCrawlInfoAddress(cls):
    settings = get_project_settings()
    localIp = settings.get("CRAWL_IP", HttpUtils.get_ip_address())
    cfg = get_config()
    scrapydUrl = cfg.get('deploy', "url").replace("http://", "").replace("/", "")
    address = scrapydUrl.replace("localhost", localIp)
    return address
def __init__(self, queuename="default"):
    self.queuename = queuename
    self.seen_queues = []
    self.connection = pika.BlockingConnection(
        pika.ConnectionParameters(conf.get_config().get('RabbitConfig', 'host')))
    self.channel = self.connection.channel()
    self.client_params = {"x-ha-policy": "all"}
    self.channel.queue_declare(queue=self.queuename, arguments=self.client_params)
    self.seen_queues.append(self.queuename)
def project_data_dir(project='default'):
    """Return the current project data dir, creating it if it doesn't exist"""
    assert inside_project(), "Not inside project"
    scrapy_cfg = closest_scrapy_cfg()
    d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
    cfg = get_config()
    if cfg.has_option(DATADIR_CFG_SECTION, project):
        d = cfg.get(DATADIR_CFG_SECTION, project)
    if not exists(d):
        makedirs(d)
    return d
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists("setup.py"):
        settings = get_config().get("settings", "default")
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call([sys.executable, "setup.py", "clean", "-a", "bdist_egg", "-d", d],
               stdout=f)
    egg = glob.glob(os.path.join(d, "*.egg"))[0]
    return egg, d
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call([sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
               stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
def _get_targets():
    cfg = get_config()
    baset = dict(cfg.items('deploy')) if cfg.has_section('deploy') else {}
    baset.setdefault('url', 'http://dash.scrapinghub.com/api/scrapyd/')
    targets = {}
    targets['default'] = baset
    for x in cfg.sections():
        if x.startswith('deploy:'):
            t = baset.copy()
            t.update(cfg.items(x))
            targets[x[7:]] = t
    return targets
def _get_targets():
    cfg = get_config()
    baset = dict(cfg.items('deploy')) if cfg.has_section('deploy') else {}
    targets = {}
    if 'url' in baset:
        targets['default'] = baset
    for x in cfg.sections():
        if x.startswith('deploy:'):
            t = baset.copy()
            t.update(cfg.items(x))
            targets[x[7:]] = t
    return targets
def _get_targets():
    cfg = get_config()
    baset = dict(cfg.items("deploy")) if cfg.has_section("deploy") else {}
    targets = {}
    if "url" in baset:
        targets["default"] = baset
    for x in cfg.sections():
        if x.startswith("deploy:"):
            t = baset.copy()
            t.update(cfg.items(x))
            targets[x[7:]] = t
    return targets
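# A minimal, self-contained sketch (not taken from the snippets above; target
# names and URLs are hypothetical) of the scrapy.cfg layout that the
# _get_targets() variants read, and the dict they build: every [deploy:<name>]
# section starts from the base [deploy] options and overrides them, so shared
# credentials only need to be written once.
from configparser import ConfigParser

SAMPLE_SCRAPY_CFG = """
[deploy]
url = http://localhost:6800/
username = someuser

[deploy:production]
url = http://scrapyd.example.com:6800/
"""

def demo_get_targets():
    cfg = ConfigParser()
    cfg.read_string(SAMPLE_SCRAPY_CFG)  # stand-in for scrapy's get_config()
    base = dict(cfg.items('deploy')) if cfg.has_section('deploy') else {}
    targets = {'default': base}
    for section in cfg.sections():
        if section.startswith('deploy:'):
            target = base.copy()
            target.update(cfg.items(section))
            targets[section[len('deploy:'):]] = target
    return targets

# demo_get_targets() ->
#   {'default':    {'url': 'http://localhost:6800/', 'username': 'someuser'},
#    'production': {'url': 'http://scrapyd.example.com:6800/', 'username': 'someuser'}}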
def _read_scrapy_cfg_key():
    try:
        from scrapy.utils.conf import get_config
        cfg = get_config()
        if cfg.has_section('deploy'):
            deploy = dict(cfg.items('deploy'))
            key = deploy.get('username')
            if key:
                return key
    except Exception:
        return None
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call(
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp(prefix="scrapydeploy-")
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=o, stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
def _build_egg(keep_build):
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    if not keep_build:
        shutil.rmtree('build')
        shutil.rmtree('project.egg-info')
    return egg, d
def project_data_dir(project='default'):
    """Return the current project data dir, creating it if it doesn't exist"""
    if not inside_project():
        raise NotConfigured("Not inside a project")
    cfg = get_config()
    if cfg.has_option(DATADIR_CFG_SECTION, project):
        d = cfg.get(DATADIR_CFG_SECTION, project)
    else:
        scrapy_cfg = closest_scrapy_cfg()
        if not scrapy_cfg:
            raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir")
        d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
    if not exists(d):
        os.makedirs(d)
    return d
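# For reference, the scrapy.cfg override that the project_data_dir() variants
# above look up; DATADIR_CFG_SECTION is assumed to be the usual 'datadir'
# section name, and the path below is purely hypothetical:
#
#   [datadir]
#   default = /var/lib/myproject/.scrapy
#
# Without such an entry, both variants fall back to a '.scrapy' directory next
# to the closest scrapy.cfg and create it if it does not exist.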
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        scrapy_project_settings = get_config()
        settings = scrapy_project_settings.get('settings', 'default')
        project = scrapy_project_settings.get('deploy', 'project')
        _create_default_setup_py(settings=settings, project=project)
    d = 'dist'
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=sys.stdout, stderr=sys.stderr)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
def start(self, url=None, request=None, response=None, spider=None, redirect=True):
    # disable accidental Ctrl-C key press from shutting down the engine
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    if url:
        self.fetch(url, spider, redirect=redirect)  # a URL was given, so fetch that URL
    elif request:
        self.fetch(request, spider)  # a request was given, so fetch that request
    elif response:
        request = response.request
        self.populate_vars(response, request, spider)
    else:
        self.populate_vars()
    if self.code:
        # eval() evaluates the code string; the first argument is the string,
        # the second is the globals namespace, the third is the locals namespace
        print(eval(self.code, globals(), self.vars))
    else:
        # no code was passed in, so work out the desired interactive Python
        # shell from the environment variable and from scrapy.cfg
        """
        Detect interactive shell setting in scrapy.cfg
        e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
        [settings]
        # shell can be one of ipython, bpython or python;
        # to be used as the interactive python console, if available.
        # (default is ipython, fallbacks in the order listed above)
        shell = python
        """
        cfg = get_config()  # configuration from scrapy.cfg
        section, option = 'settings', 'shell'
        env = os.environ.get('SCRAPY_PYTHON_SHELL')
        shells = []
        if env:
            shells += env.strip().lower().split(',')
        elif cfg.has_option(section, option):
            shells += [cfg.get(section, option).strip().lower()]
        else:  # try all by default
            shells += DEFAULT_PYTHON_SHELLS.keys()
        # always add standard shell as fallback
        shells += ['python']
        start_python_console(self.vars, shells=shells,
                             banner=self.vars.pop('banner', ''))  # start the interactive shell
def start(self, url=None, request=None, response=None, spider=None, redirect=True):
    # disable accidental Ctrl-C key press from shutting down the engine
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    if url:
        self.fetch(url, spider, redirect=redirect)
    elif request:
        self.fetch(request, spider)
    elif response:
        request = response.request
        self.populate_vars(response, request, spider)
    else:
        self.populate_vars()
    if self.code:
        print(eval(self.code, globals(), self.vars))
    else:
        """
        Detect interactive shell setting in scrapy.cfg
        e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
        [settings]
        # shell can be one of ipython, bpython or python;
        # to be used as the interactive python console, if available.
        # (default is ipython, fallbacks in the order listed above)
        shell = python
        """
        cfg = get_config()
        section, option = 'settings', 'shell'
        env = os.environ.get('SCRAPY_PYTHON_SHELL')
        shells = []
        if env:
            shells += env.strip().lower().split(',')
        elif cfg.has_option(section, option):
            shells += [cfg.get(section, option).strip().lower()]
        else:  # try all by default
            shells += DEFAULT_PYTHON_SHELLS.keys()  # all known interactive Python shell types
        # always add standard shell as fallback
        shells += ['python']  # the plain python shell is always appended last
        # start the shell interface for the selected shell type
        start_python_console(self.vars, shells=shells,
                             banner=self.vars.pop('banner', ''))
def build_egg(project):
    path = get_run_path()
    path = '{path}/storage/{project}/'.format(path=path, project=project.name)
    os.chdir(path)
    settings = get_config().get('settings', 'default')
    _create_default_setup_py(settings=settings, project=project.name)
    d = tempfile.mkdtemp(prefix="scrapydeploy-")
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=o, stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    os.system('rm *.egg')
    shutil.move(egg, path)
    return find_egg(path)
def start(self, url=None, request=None, response=None, spider=None):
    # disable accidental Ctrl-C key press from shutting down the engine
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    if url:
        self.fetch(url, spider)
    elif request:
        self.fetch(request, spider)
    elif response:
        request = response.request
        self.populate_vars(response, request, spider)
    else:
        self.populate_vars()
    if self.code:
        print(eval(self.code, globals(), self.vars))
    else:
        """
        Detect interactive shell setting in scrapy.cfg
        e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
        [settings]
        # shell can be one of ipython, bpython or python;
        # to be used as the interactive python console, if available.
        # (default is ipython, fallbacks in the order listed above)
        shell = python
        """
        cfg = get_config()
        section, option = 'settings', 'shell'
        env = os.environ.get('SCRAPY_PYTHON_SHELL')
        shells = []
        if env:
            shells += env.strip().lower().split(',')
        elif cfg.has_option(section, option):
            shells += [cfg.get(section, option).strip().lower()]
        else:  # try all by default
            shells += DEFAULT_PYTHON_SHELLS.keys()
        # always add standard shell as fallback
        shells += ['python']
        start_python_console(self.vars, shells=shells,
                             banner=self.vars.pop('banner', ''))
def _get_option(section, option, default=None):
    cfg = get_config()
    return cfg.get(section, option) if cfg.has_option(section, option) \
        else default
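# A short usage sketch for the _get_option() helper above; the section and
# option names here are illustrative, not part of any fixed scrapy.cfg schema.
scrapyd_url = _get_option('deploy', 'url', default='http://localhost:6800/')
api_username = _get_option('deploy', 'username')  # None when the option is absent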
    address = StringUtils.getCrawlInfoAddress()
    url = settings.get("CRAWL_INFO_SCHEDULE_URL") % (address, "", "")
    jsonResult = HttpUtils.getUrl(url)
    list = jsonResult["data"]
    return list


def test(url, project):
    print("%s %s" % (url, project))
    # logger.info("%s %s" % (url, project))


if __name__ == '__main__':
    # logging.basicConfig(filename="/home/yhye/python_project/gemantic-python/crawl_selected/log.conf")
    # logger = logging.getLogger("main")
    cfg = get_config()
    url = cfg.get('deploy', "url")
    settings = get_project_settings()
    # project = settings.get("BOT_NAME")
    # logger.info("start schedule %s %s" % (url, project))
    now = datetime.datetime.now()
    # sys.stderr.write("%s start schedule %s %s" % (now, url, project))
    print("%s start schedule %s" % (now, url))
    # schedule.every().minute.do(test, url, project)
    crawlInfos = getCrawlInfo()
    print(crawlInfos)
    for crawlInfo in crawlInfos:
        scheduleType = crawlInfo["scheduleType"]
        scheduleEvery = crawlInfo["schedule"]
        project = crawlInfo["project"]
class SentinelSpider(XMLFeedSpider):
    def __init__(self, settings, request_id=ID, polygon=None,
                 begin_date=None, end_date=None, *args, **kwargs):
        """construct with settings"""
        self.settings = settings
        self.logger.info("polygon %s", polygon)
        self.request_id = request_id
        # last 2 weeks by default
        if begin_date is None and end_date is None:
            # from globals
            begin_position = "%s TO %s" % (BEGIN_DATE, END_DATE)
            end_position = "%s TO %s" % (BEGIN_DATE, END_DATE)
        else:
            # arguments
            begin_position = "%s TO %s" % (begin_date, end_date)
            end_position = "%s TO %s" % (begin_date, end_date)
        # build the query
        query_parts = []
        if polygon:
            query_parts.append(
                'footprint:"Intersects({polygon})"'.format(polygon=polygon))
        if begin_position:
            query_parts.append('beginPosition:[{0}]'.format(begin_position))
        if end_position:
            query_parts.append('endPosition:[{0}]'.format(end_position))
        # build the start url
        query = " AND ".join(query_parts)
        self.start_urls = [
            'https://' + domain + '/dhus/api/search?' +
            urllib.urlencode({"q": query if query_parts else "*"})
        ]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """expose the settings"""
        settings = crawler.settings
        spider = cls(settings, *args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    name = 'sentinel'
    allowed_domains = [domain]
    rules = [
        # Extract links matching 'dhus/api/search'
        # and follow links from them (since no callback means follow=True by default).
        # todo fix restrict_xpaths to allow for non-html links
        Rule(
            LxmlLinkExtractor(allow="/dhus/api/search",
                              tags="{http://www.w3.org/2005/Atom}link",
                              attrs=("href", )))
    ]
    # the iternodes iterator does not work because the xml is using nested xml
    iterator = 'xml'
    itertag = 'atom:entry'
    namespaces = [('atom', 'http://www.w3.org/2005/Atom'),
                  ('opensearch', 'http://a9.com/-/spec/opensearch/1.1/')]

    # get passwords from config
    http_user = get_config().get(domain, 'username')
    http_pass = get_config().get(domain, 'password')

    def parse_nodes(self, response, nodes):
        """
        Inherited from XMLFeedSpider
        Extended to also return requests.
        """
        for selector in nodes:
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                yield result_item

        seen = set()
        for i, rule in enumerate(self.rules):
            links = [
                l for l in rule.link_extractor.extract_links(response)
                if l not in seen
            ]
            self.logger.info('links %s', links)
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url)
                r.meta.update(rule=i, link_text=link.text)
                yield rule.process_request(r)

    def parse_node(self, response, selector):
        self.logger.info("selector %s", selector)
        l = ItemLoader(SatItem(), selector=selector, response=response)
        l.default_output_processor = TakeFirst()
        l.add_xpath("metadata", "atom:link[@rel='alternative']/@href")
        l.add_xpath("icon", "atom:link[@rel='icon']/@href")
        l.add_xpath("download", "atom:link/@href")
        l.add_xpath('footprint', "atom:str[@name='footprint']/text()")
        l.add_xpath('id', 'atom:id/text()')
        l.add_xpath('identifier', "atom:str[@name='identifier']/text()")
        l.add_value('requestId', self.request_id)
        i = l.load_item()
        return i
def get_option(section: str, option: str, cfg=get_config()):
    '''Gets option from scrapy.cfg in project root'''
    return cfg.get(section, option) if cfg.has_option(section, option) else None
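# One caveat about the variant above: the cfg=get_config() default argument is
# evaluated once, when the function is defined, so later edits to scrapy.cfg
# are not picked up. A sketch of a lazier variant (hypothetical name) that
# re-reads the config at call time:
from scrapy.utils.conf import get_config

def get_option_lazy(section: str, option: str, cfg=None):
    '''Like get_option above, but resolves the config parser per call.'''
    if cfg is None:
        cfg = get_config()
    return cfg.get(section, option) if cfg.has_option(section, option) else None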
"hurricane", "tsunami", "heavy rainfall", "overlast water", "overstroming", "inondation", "flut", "regenval", "hevige neerslag", "niederschlagsmenge", "dike breakthrough", "dijk doorbraak" ] points = [] accesstoken = get_config().get('opencalais', 'token') class newsspider(XMLFeedSpider): name = 'newsspider' start_urls = [] for t in range(len(tags)): start_urls.append('http://emm.newsbrief.eu/rss?type=search&mode=advanced&atleast=' + tags[t]) start_urls.append('http://feeds.feedburner.com/Floodlist') namespaces = [ ("georss", "http://www.georss.org/georss")] def text_scan(self, description): text = description.split(" ") textlist = [] for i in range(len(text)):
import datetime as dt

import numpy as np
from scrapy.spiders import XMLFeedSpider, Spider, CrawlSpider, Rule
from scrapy.utils.conf import get_config

start_urls = []

# some tags used in floods
tags = [
    "flood", "hurricane", "tsunami", "heavy rainfall", "overlast water",
    "overstroming", "inondation", "flut", "regenval", "hevige neerslag",
    "niederschlagsmenge", "dike breakthrough", "dijk doorbraak"
]
points = []

accesstoken = get_config().get('opencalais', 'token')


class newsspider(XMLFeedSpider):
    name = 'newsspider'
    start_urls = []
    for t in range(len(tags)):
        start_urls.append(
            'http://emm.newsbrief.eu/rss?type=search&mode=advanced&atleast='
            + tags[t])
    start_urls.append('http://feeds.feedburner.com/Floodlist')
    namespaces = [("georss", "http://www.georss.org/georss")]

    def text_scan(self, description):
        text = description.split(" ")
        textlist = []