def from_crawler(cls, crawler):
    url = crawler.settings.get('SPLASH_URL', cls.SPLASH_DEFAULT_URL)
    directives_dir = crawler.settings.get('SPLASH_DIRECTIVES_DIR',
                                          cls.SPLASH_DEFAULT_DIRECTIVES_DIR)
    project_root = os.path.dirname(closest_scrapy_cfg())
    directives_dir = os.path.join(project_root, directives_dir)
    return cls(crawler, url, directives_dir)
def test_find_scrapy_project_invalid_conf(self, workdir):
    config = closest_scrapy_cfg()
    with open(config, 'wb') as f:
        f.write(b'[other_section]')
    with pytest.raises(RuntimeError) as err:
        find_scrapy_project('default')
    assert str(err.value) == "No section: 'settings'"
def build_egg(project: str):
    '''Build egg in project root'''
    click.echo('Building egg...')
    closest = closest_scrapy_cfg()
    if closest == '':
        click.echo('No scrapy.cfg found')
        exit(1)
    directory = os.path.dirname(closest)
    os.chdir(directory)
    if not os.path.exists('setup.py'):
        click.echo('No setup.py in project')
        exit(1)
    d = tempfile.mkdtemp(prefix='scrapydeploy-')
    with open(os.path.join(d, 'stdout'), 'wb') as o, open(os.path.join(d, 'stderr'), 'wb') as e:
        p = [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d]
        retry_on_eintr(check_call, p, stdout=o, stderr=e)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    filename = f'{project}.egg'
    shutil.copyfile(egg, filename)
    return f'{directory}/{filename}'
def get_project_dirs():
    outer_dir = inner_dir = ""
    closest_cfg = closest_scrapy_cfg()
    if closest_cfg:
        outer_dir = os.path.dirname(closest_cfg)
    if os.environ.get('SCRAPY_PROJECT'):
        inner_dir = os.environ.get('SCRAPY_PROJECT')
    if outer_dir and inner_dir:
        return (outer_dir, inner_dir)
    init_env()
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is None and not outer_dir:
        raise Exception("Project configuration awry")
    if not inner_dir:
        inner_dir = scrapy_module.split('.')[0]
    if outer_dir and inner_dir:
        return (outer_dir, inner_dir)
    try:
        module = import_module(scrapy_module)
        outer_dir = os.path.dirname(os.path.dirname(module.__file__))
        return (outer_dir, inner_dir)
    except ImportError:
        raise Exception("Project configuration awry")
def build(self):
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        if not self.settings_name:
            self.settings_name = 'default'
        settings = get_config().get('settings', self.settings_name)
        self._create_default_setup_py(settings=settings)
        print('build egg using [%s] settings' % self.settings_name)
    if not self.file_path:
        self.file_path = tempfile.mkdtemp(prefix="scrapydeploy-")
    else:
        if not os.path.exists(self.file_path):
            os.mkdir(self.file_path)
    d = self.file_path
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=o, stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    self._rename_egg_name(egg, d)
def _getsources(self):
    sources = ['/etc/scrapyd/scrapyd.conf', r'c:\scrapyd\scrapyd.conf']
    sources += sorted(glob.glob('/etc/scrapyd/conf.d/*'))
    sources += ['scrapyd.conf']
    scrapy_cfg = closest_scrapy_cfg()
    if scrapy_cfg:
        sources.append(scrapy_cfg)
    return sources
def get_prot_root():
    '''Get the project root directory.

    Returns:
        proj_root_dir: absolute path of the project root directory
    '''
    return os.path.dirname(closest_scrapy_cfg())
def get_project_root():
    """
    Return the absolute path of the project root, raising an exception
    if the current directory is not inside a Scrapy project.
    """
    if inside_project():
        return os.path.dirname(closest_scrapy_cfg())
    raise Exception("%s does not belong to a Scrapy project" % os.getcwd())
def _delete_old_package():
    cfg_path = closest_scrapy_cfg()
    root = os.path.dirname(cfg_path)
    build = root + r"\build"
    egg = root + r"\project.egg-info"
    if os.path.exists(build):
        util.delete_folder(build)
    if os.path.exists(egg):
        util.delete_folder(egg)
def inside_project():
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is not None:
        try:
            import_module(scrapy_module)
        except ImportError as exc:
            warnings.warn("Cannot import scrapy settings module %s: %s"
                          % (scrapy_module, exc))
        else:
            return True
    return bool(closest_scrapy_cfg())
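A minimal sketch of how a project-aware command might combine Scrapy's own inside_project and closest_scrapy_cfg utilities before touching project files; require_project_root is only an illustrative name, not part of any of the projects above:

import os
import sys

from scrapy.utils.conf import closest_scrapy_cfg
from scrapy.utils.project import inside_project


def require_project_root():
    # Abort early when run outside a Scrapy project, otherwise return the
    # directory containing scrapy.cfg so callers can resolve paths against it.
    if not inside_project():
        sys.exit("This command must be run inside a Scrapy project")
    return os.path.dirname(closest_scrapy_cfg())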
def _getsources(self):
    sources = ['/etc/scrapyd/scrapyd.conf', r'c:\scrapyd\scrapyd.conf']
    sources += sorted(glob.glob('/etc/scrapyd/conf.d/*'))
    sources += ['scrapyd.conf']
    env_source = os.environ.get('SCRAPYD_CONF')
    if env_source is not None:
        sources += [env_source]
    scrapy_cfg = closest_scrapy_cfg()
    if scrapy_cfg:
        sources.append(scrapy_cfg)
    return sources
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call([sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
               stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
def project_data_dir(project='default'):
    """Return the current project data dir, creating it if it doesn't exist"""
    assert inside_project(), "Not inside project"
    scrapy_cfg = closest_scrapy_cfg()
    d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
    cfg = get_config()
    if cfg.has_option(DATADIR_CFG_SECTION, project):
        d = cfg.get(DATADIR_CFG_SECTION, project)
    if not exists(d):
        makedirs(d)
    return d
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists("setup.py"):
        settings = get_config().get("settings", "default")
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call([sys.executable, "setup.py", "clean", "-a", "bdist_egg", "-d", d],
               stdout=f)
    egg = glob.glob(os.path.join(d, "*.egg"))[0]
    return egg, d
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call(
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
def _getsources(self):
    """
    Collect configuration file sources.
    :return: list of config file paths
    """
    sources = ['/etc/engine/engine.conf', r'c:\engine\engine.conf']
    sources += sorted(glob.glob('/etc/engine/conf.d/*'))
    sources += ['engine.conf']
    sources += [expanduser('~/.engine.conf')]
    scrapy_cfg = closest_scrapy_cfg()
    if scrapy_cfg:
        sources.append(scrapy_cfg)
    return sources
def from_crawler(cls, crawler):
    # This method is used by Scrapy to create your spiders.
    apple_secure = ''
    root_path = '/'.join(closest_scrapy_cfg().split('/')[0:-1])
    bot_path = root_path + '/' + crawler.settings['BOT_NAME'] + '/'
    config_path = (crawler.settings['APPLE_SECURE']
                   if crawler.settings['APPLE_SECURE'][0:1] == '/'
                   else bot_path + crawler.settings['APPLE_SECURE'])
    with open(config_path) as f:
        apple_secure = f.read()
    s = cls(crawler.settings, apple_secure)
    crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    return s
def inside_project():
    # 1. This variable is already set by get_project_settings
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is not None:
        try:
            import_module(scrapy_module)  # check that the module can be imported
        except ImportError as exc:
            warnings.warn("Cannot import scrapy settings module %s: %s"
                          % (scrapy_module, exc))
        else:
            return True
    # 2. If it is not set, fall back to looking for the nearest scrapy.cfg
    return bool(closest_scrapy_cfg())
def _build_egg(keep_build):
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    retry_on_eintr(check_call,
                   [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
                   stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    if not keep_build:
        shutil.rmtree('build')
        shutil.rmtree('project.egg-info')
    return egg, d
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp(prefix="scrapydeploy-")
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(check_call,
                   [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
                   stdout=o, stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
def _settings_from_spider(cls, sp):
    logger = logging.getLogger(getattr(cls, "name") or __name__)
    # Given a class, the workflow is as follows:
    # 1) get the path to the file defining the class
    # 2) get global config locations from sconf.get_sources(False)
    # 3) search all of these for config files using sconf.closest_scrapy_cfg
    # 4) pass the config filepaths to ConfigParser, with local overriding global
    # 5) search the settings section of the parsed config for an entry whose
    #    name matches the name of the package of the class, or the 'default'
    #    entry if no other was found. This defines the module import path.
    # 6) import the module
    spcls = sp if inspect.isclass(sp) else sp.__class__
    spcls_name = spcls.__name__
    spcls_path = inspect.getfile(spcls)
    global_cfg_files = sconf.get_sources(False)
    sp_cfg_file = sconf.closest_scrapy_cfg(spcls_path)
    cfg_files = [*global_cfg_files, sp_cfg_file]
    cfg = ConfigParser()
    cfg.read(cfg_files)
    # Find the relevant entry by iterating over the entries in the config's
    # settings section, picking the matching entry or falling back to 'default'.
    try:
        cfg_settings_sect = cfg['settings']
    except KeyError:
        logger.debug('Skipping settings import from ancestor "{}". '
                     'Cannot find scrapy.cfg file'.format(spcls_name))
        return
    pkg = spcls.__module__.split('.')[0]
    sttngs_path = cfg_settings_sect.get('default')
    for prj_name, sttngs in cfg_settings_sect.items():
        if prj_name == pkg:
            sttngs_path = sttngs
            break
    rel_sttngs_path = '.{}'.format(sttngs_path.split('.', 1)[1])
    logger.debug('Importing settings file of ancestor "{}" '
                 'from module "{}"'.format(spcls_name, sttngs_path))
    try:
        settings_mdl = importlib.import_module(rel_sttngs_path, pkg)
        return settings_mdl
    except ModuleNotFoundError:
        logger.debug('Skipping settings import from ancestor "{}". '
                     'Cannot find settings.py file'.format(spcls_name))
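A compact sketch of the same [settings] lookup in isolation, assuming the nearest scrapy.cfg contains a standard settings section; 'myproject' is a hypothetical package name and resolve_settings_module is not part of the snippet above:

from configparser import ConfigParser

from scrapy.utils.conf import closest_scrapy_cfg


def resolve_settings_module(package='myproject'):
    # Read the nearest scrapy.cfg and return the settings module registered
    # under the package name, falling back to the 'default' entry.
    cfg = ConfigParser()
    cfg.read(closest_scrapy_cfg())
    section = cfg['settings']
    return section.get(package, section.get('default'))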
def find_scrapy_project(project):
    project_config_path = closest_scrapy_cfg()
    if not project_config_path:
        raise RuntimeError('Cannot find scrapy.cfg file')
    project_config = ConfigParser()
    project_config.read(project_config_path)
    try:
        project_settings = project_config.get('settings', project)
    except (NoSectionError, NoOptionError) as e:
        raise RuntimeError(str(e))
    if not project_settings:
        raise RuntimeError('Cannot find scrapy project settings')
    project_location = os.path.dirname(project_config_path)
    sys.path.append(project_location)
    return project_settings
def find_scrapy_project(project):
    project_config_path = closest_scrapy_cfg()
    if not project_config_path:
        raise RuntimeError('Cannot find scrapy.cfg file')
    project_config = SafeConfigParser()
    project_config.read(project_config_path)
    try:
        project_settings = project_config.get('settings', project)
    except (NoSectionError, NoOptionError) as e:
        raise RuntimeError(e.message)
    if not project_settings:
        raise RuntimeError('Cannot find scrapy project settings')
    project_location = os.path.dirname(project_config_path)
    sys.path.append(project_location)
    return project_settings
def inside_project():
    ## Read the project settings module ('XXX.settings') from the environment
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is not None:
        try:
            ## Import the project settings module
            import_module(scrapy_module)
        except ImportError as exc:
            warnings.warn("Cannot import scrapy settings module %s: %s"
                          % (scrapy_module, exc))
        else:
            return True
    ## If the environment variable is not set, look for the nearest scrapy.cfg;
    ## finding one means we are inside a project. Some scrapy commands depend on
    ## a project while others are global, and this nearest-scrapy.cfg lookup is
    ## how project context is detected.
    return bool(closest_scrapy_cfg())
def get_project_dir():
    closest_cfg = closest_scrapy_cfg()
    if closest_cfg:
        return Path(closest_cfg).parent
    init_env()
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is None:
        return None
    try:
        module = import_module(scrapy_module)
        return Path(module.__file__).parent.parent
    except ImportError:
        return None
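A brief usage sketch for the helper above; the data/ subfolder and file name are illustrative only:

project_dir = get_project_dir()
if project_dir is None:
    raise RuntimeError("Not inside a Scrapy project")
# pathlib.Path supports '/' joins, so derived paths stay anchored to the project root.
output_file = project_dir / 'data' / 'items.jl'
output_file.parent.mkdir(parents=True, exist_ok=True)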
def project_data_dir(project='default'):
    """Return the current project data dir, creating it if it doesn't exist"""
    if not inside_project():
        raise NotConfigured("Not inside a project")
    cfg = get_config()
    if cfg.has_option(DATADIR_CFG_SECTION, project):
        d = cfg.get(DATADIR_CFG_SECTION, project)
    else:
        scrapy_cfg = closest_scrapy_cfg()
        if not scrapy_cfg:
            raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir")
        d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
    if not exists(d):
        os.makedirs(d)
    return d
def __init__(self, *args, **kwargs):
    super(SnapshotSpider, self).__init__(*args, **kwargs)
    if 'start_url' in kwargs:
        self.start_urls = [kwargs.get('start_url')]
    elif 'start_urls' in kwargs:
        self.start_urls = kwargs.get('start_urls').split(',')
    else:
        self.start_urls = [
            'https://w5.ab.ust.hk/wcq/cgi-bin/'
        ]
    now = datetime.now(tz=pytz.timezone('Hongkong'))
    minute = '00' if now.minute < 30 else '30'
    self.time = now.strftime('%Y-%m-%d %H:') + minute
    self.template = lxml.html.parse('template.html')
    proj_root = os.path.dirname(closest_scrapy_cfg())
    # store the current snapshot at the directory self.snapshot_dir
    self.snapshot_dir = os.path.join(proj_root, 'current-snapshot')
    os.makedirs(f'{self.snapshot_dir}/subjects', exist_ok=True)
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        scrapy_project_settings = get_config()
        settings = scrapy_project_settings.get('settings', 'default')
        project = scrapy_project_settings.get('deploy', 'project')
        _create_default_setup_py(settings=settings, project=project)
    d = 'dist'
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=sys.stdout, stderr=sys.stderr)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
def add_schedule():
    scrapy_cfg = closest_scrapy_cfg()
    cp = SafeConfigParser()
    if scrapy_cfg:
        cp.read(scrapy_cfg)
    parser = OptionParser(prog='scrapydd add_schedule')
    parser.add_option('-p', '--project', help='the project name')
    parser.add_option('-s', '--spider', help='the spider name')
    parser.add_option('-d', '--schedule', help='cron expression of schedule')
    parser.add_option('--host', help='the server address')
    opts, args = parser.parse_args()
    try:
        project = opts.project or cp.get('deploy', 'project')
    except Error:
        print 'Error: project is required'
        parser.print_help()
        return
    spider = opts.spider
    if spider is None:
        print 'Error: spider is required'
        parser.print_help()
        return
    try:
        host = opts.host or cp.get('deploy', 'url')
    except Error:
        print 'Error: host is required'
        parser.print_help()
        return
    schedule = opts.schedule
    if schedule is None:
        print 'Error: schedule is required'
        parser.print_help()
        return
    url = urlparse.urljoin(host, '/add_schedule.json')
    postdata = urllib.urlencode({
        'project': project,
        'spider': spider,
        'cron': schedule
    })
    res = urllib2.urlopen(url, postdata)
    print res.read()
def find_scrapy_project(project, crawler="jianshu"):
    # project_config_path = closest_scrapy_cfg("./crawler/jianshu/")
    project_dir = "./crawler/{}/".format(crawler)
    print("project_dir======={}".format(project_dir))
    project_config_path = closest_scrapy_cfg(project_dir)
    if not project_config_path:
        raise RuntimeError('Cannot find scrapy.cfg file')
    project_config = SafeConfigParser()
    project_config.read(project_config_path)
    try:
        project_settings = project_config.get('settings', project)
    except (NoSectionError, NoOptionError) as e:
        raise RuntimeError(e.message)
    if not project_settings:
        raise RuntimeError('Cannot find scrapy project settings')
    project_location = os.path.dirname(project_config_path)
    sys.path.append(project_location)
    return project_settings
class TruliaSpider(scrapy.Spider):
    name = 'sold_0'
    allowed_domains = ['trulia.com']
    custom_settings = {
        'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()),
                                 'data/iterate/sold_%(start)s_%(time)s.jl'),
        'FEED_FORMAT': 'jsonlines'
    }

    def __init__(self, state='IL', city='Chicago', start=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start = start
        self.start_urls = ['https://www.trulia.com/sold/{city},{state}/'.format(state=state, city=city)]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/p/')

    def parse(self, response):
        # N = 598  # trulia.TruliaSpider.get_number_of_pages_to_scrape(response)
        M = self.start
        N = M + 49
        self.logger.info("Searching between index page {M} and index page {N}".format(N=N, M=M))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(M, N + 1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        item_loader = TruliaItemLoader(item=TruliaItem(), response=response)
        trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response)
        details = item_loader.nested_css('.homeDetailsHeading')
        taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div')
        taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()')
        item = item_loader.load_item()
        trulia.TruliaSpider.post_process(item=item)
        return item
class TruliaSpider(scrapy.Spider):
    name = 'trulia_sold'
    allowed_domains = ['trulia.com']
    custom_settings = {
        'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()),
                                 'data/data_sold_%(state)s_%(city)s_%(time)s.jl'),
        'FEED_FORMAT': 'jsonlines'
    }

    def __init__(self, state='CA', city='San_Francisco', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start_urls = ['http://trulia.com/sold/{city},{state}/'.format(state=state, city=city)]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/homes/.+/sold/')

    def parse(self, response):
        N = trulia.TruliaSpider.get_number_of_pages_to_scrape(response)
        self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. Proceeding to scrape each index page...".format(N=N))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N + 1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        item_loader = TruliaItemLoader(item=TruliaItem(), response=response)
        trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response)
        details = item_loader.nested_css('.homeDetailsHeading')
        taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div')
        taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()')
        item = item_loader.load_item()
        trulia.TruliaSpider.post_process(item=item)
        return item
def inside_project():
    return bool(closest_scrapy_cfg())
class TruliaSpider(scrapy.Spider):
    name = 'sale_0'
    allowed_domains = ['trulia.com']
    custom_settings = {
        'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()),
                                 'data/iterate/sale_%(start)s_%(time)s.jl'),
        'FEED_FORMAT': 'jsonlines'
    }

    def __init__(self, state='IL', city='Chicago', start=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start = start
        self.start_urls = [
            'https://www.trulia.com/{state}/{city}'.format(state=state, city=city)
        ]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/p/')

    def parse(self, response):
        # N = self.get_number_of_pages_to_scrape(response)
        M = self.start
        N = M + 49  # self.stop
        self.logger.info("Searching between index page {M} and index page {N}".format(N=N, M=M))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(M, N + 1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    @staticmethod
    def get_number_of_pages_to_scrape(response):
        # pagination = response.css('.paginationContainer').xpath('.//*/text()[contains(., "Results")]')
        # number_of_results = int(pagination.re_first(r'^1 - 30 of ([\d,]+) Results$').replace(',', ''))
        number_of_results = 249
        return math.ceil(number_of_results / 30)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        l = TruliaItemLoader(item=TruliaItem(), response=response)
        self.load_common_fields(item_loader=l, response=response)
        listing_information = l.nested_xpath('//span[text() = "LISTING INFORMATION"]')
        listing_information.add_xpath('listing_information', './parent::div/following-sibling::ul[1]/li/text()')
        listing_information.add_xpath('listing_information_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)')
        public_records = l.nested_xpath('//span[text() = "PUBLIC RECORDS"]')
        public_records.add_xpath('public_records', './parent::div/following-sibling::ul[1]/li/text()')
        public_records.add_xpath('public_records_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)')
        item = l.load_item()
        self.post_process(item=item)
        return item

    @staticmethod
    def load_common_fields(item_loader, response):
        '''Load field values which are common to "on sale" and "recently sold" properties.'''
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('address', '//*[@data-role="address"]/text()')
        item_loader.add_xpath('city_state', '//*[@data-role="cityState"]/text()')
        item_loader.add_xpath('price', '//span[@data-role="price"]/text()', re=r'\$([\d,]+)')
        item_loader.add_xpath('neighborhood', '//*[@data-role="cityState"]/parent::h1/following-sibling::span/a/text()')
        details = item_loader.nested_css('.homeDetailsHeading')
        overview = details.nested_xpath('.//span[contains(text(), "Overview")]/parent::div/following-sibling::div[1]')
        overview.add_xpath('overview', xpath='.//li/text()')
        overview.add_xpath('area', xpath='.//li/text()', re=r'([\d,]+) sqft$')
        overview.add_xpath('lot_size', xpath='.//li/text()', re=r'([\d,.]+) (?:acres|sqft) lot size$')
        overview.add_xpath('lot_size_units', xpath='.//li/text()', re=r'[\d,.]+ (acres|sqft) lot size$')
        overview.add_xpath('price_per_square_foot', xpath='.//li/text()', re=r'\$([\d,.]+)/sqft$')
        overview.add_xpath('bedrooms', xpath='.//li/text()', re=r'(\d+) (?:Beds|Bed|beds|bed)$')
        overview.add_xpath('bathrooms', xpath='.//li/text()', re=r'(\d+) (?:Baths|Bath|baths|bath)$')
        overview.add_xpath('year_built', xpath='.//li/text()', re=r'Built in (\d+)')
        overview.add_xpath('days_on_Trulia', xpath='.//li/text()', re=r'([\d,]) days on Trulia$')
        overview.add_xpath('views', xpath='.//li/text()', re=r'([\d,]+) views$')
        # item_loader.add_css('description', '#descriptionContainer *::text')
        price_events = details.nested_xpath('.//*[text() = "Price History"]/parent::*/following-sibling::*[1]/div/div')
        price_events.add_xpath('prices', './div[contains(text(), "$")]/text()')
        price_events.add_xpath('dates', './div[contains(text(), "$")]/preceding-sibling::div/text()')
        price_events.add_xpath('events', './div[contains(text(), "$")]/following-sibling::div/text()')

    @staticmethod
    def post_process(item):
        '''Add any additional data to an item after loading it'''
        if item.get('dates') is not None:
            dates = [datetime.datetime.strptime(date, '%m/%d/%Y') for date in item['dates']]
            prices = [int(price.lstrip('$').replace(',', '')) for price in item['prices']]
            item['price_history'] = sorted(list(zip(dates, prices, item['events'])), key=lambda x: x[0])
def project_root():
    return os.path.dirname(closest_scrapy_cfg())
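Several of the helpers above assume a scrapy.cfg is always found; closest_scrapy_cfg() returns an empty string otherwise, and os.path.dirname('') silently yields '' as well. A minimal defensive variant, as a sketch (project_root_or_raise is an illustrative name):

import os

from scrapy.utils.conf import closest_scrapy_cfg


def project_root_or_raise():
    # closest_scrapy_cfg() returns '' when no scrapy.cfg is found between the
    # current directory and the filesystem root, so fail loudly in that case.
    cfg = closest_scrapy_cfg()
    if not cfg:
        raise RuntimeError('Cannot find scrapy.cfg; not inside a Scrapy project?')
    return os.path.dirname(cfg)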
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.utils.conf import closest_scrapy_cfg
import csv
import random
import os

proj_root = os.path.dirname(closest_scrapy_cfg())
basepath = proj_root + "/site_scraper/"  # path to the folder containing the "spiders" directory
spiderpath = basepath + "spiders/"
urlfile = open(spiderpath + "urls.csv", 'r')
domainfile = open(spiderpath + "domains.csv", 'r')
r1 = csv.reader(urlfile)
next(r1)
r2 = csv.reader(domainfile)
next(r2)
urls = [row[0] for row in r1]
domains = [row[0] for row in r2]
counter = 0


class TdSpider(CrawlSpider):
    name = 'td'
    allowed_domains = domains
    start_urls = urls
    rules = (Rule(LinkExtractor(), callback="parse_obj", follow=True), )
# -*- coding: utf-8 -*-
import os

from scrapy.utils.conf import closest_scrapy_cfg

closest = closest_scrapy_cfg()
assert closest
# Absolute path of the project directory
projdir = os.path.dirname(closest)

BOT_NAME = 'Crawler'

SPIDER_MODULES = ['Crawler.spiders']
NEWSPIDER_MODULE = 'Crawler.spiders'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16

# Maximum number of concurrent items (per response) to process in parallel in the Item Processor
CONCURRENT_ITEMS = 1000

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 5
# CONCURRENT_REQUESTS_PER_IP = 6
# CONCURRENT_REQUESTS_PER_IP = 16

# The maximum depth that will be allowed to crawl for any site. If zero, no limit will be imposed.
DEPTH_LIMIT = 0

# An integer used to adjust the request priority based on its depth. If zero, no priority adjustment is made from depth.
DEPTH_PRIORITY = 0
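Computing projdir in settings.py like this is typically done so that filesystem settings stay anchored to the project root no matter where the crawl is started from; a hedged example with illustrative values (these particular settings and folder names are not part of the original file):

# Keep downloaded files and the HTTP cache inside the project root
# regardless of the working directory (illustrative values).
FILES_STORE = os.path.join(projdir, 'downloads')
HTTPCACHE_DIR = os.path.join(projdir, '.scrapy', 'httpcache')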
def __init__(self, settings):
    self.scopes = settings['GOOGLE_AUTH_SCOPES']
    root_path = '/'.join(closest_scrapy_cfg().split('/')[0:-1])
    bot_path = root_path + '/' + settings['BOT_NAME'] + '/'
    self.credentials = (settings['GOOGLE_AUTH_CREDENTIAL_PATH']
                        if settings['GOOGLE_AUTH_CREDENTIAL_PATH'][0:1] == '/'
                        else bot_path + settings['GOOGLE_AUTH_CREDENTIAL_PATH'])
def __init__(self, settings):
    self.settings = settings
    self.scrapy_module_path = os.path.dirname(closest_scrapy_cfg())
    self.spider_modules = settings.getlist('SPIDER_MODULES')