Example #1
 def from_crawler(cls, crawler):
     url = crawler.settings.get('SPLASH_URL', cls.SPLASH_DEFAULT_URL)
     directives_dir = crawler.settings.get('SPLASH_DIRECTIVES_DIR',
                                           cls.SPLASH_DEFAULT_DIRECTIVES_DIR)
     project_root = os.path.dirname(closest_scrapy_cfg())
     directives_dir = os.path.join(project_root, directives_dir)
     return cls(crawler, url, directives_dir)
Example #2
 def test_find_scrapy_project_invalid_conf(self, workdir):
     config = closest_scrapy_cfg()
     with open(config, 'wb') as f:
         f.write(b'[other_section]')
     with pytest.raises(RuntimeError) as err:
         find_scrapy_project('default')
     assert str(err.value) == "No section: 'settings'"
Example #3
def build_egg(project: str):
    '''Build egg in project root'''

    click.echo('Building egg...')
    closest = closest_scrapy_cfg()

    if closest == '':
        click.echo('No scrapy.cfg found')
        exit(1)

    directory = os.path.dirname(closest)
    os.chdir(directory)

    if not os.path.exists('setup.py'):
        click.echo('No setup.py in project')
        exit(1)

    d = tempfile.mkdtemp(prefix='scrapydeploy-')

    with open(os.path.join(d, 'stdout'), 'wb') as o, open(os.path.join(d, 'stderr'), 'wb') as e:
        p = [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d]
        retry_on_eintr(check_call, p, stdout=o, stderr=e)

    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    filename = f'{project}.egg'
    shutil.copyfile(egg, filename)
    return f'{directory}/{filename}'
Example #4
def get_project_dirs():
    outer_dir = inner_dir = ""
    closest_cfg = closest_scrapy_cfg()
    if closest_cfg:
        outer_dir = os.path.dirname(closest_cfg)
    if os.environ.get('SCRAPY_PROJECT'):
        inner_dir = os.environ.get('SCRAPY_PROJECT')
    if outer_dir and inner_dir:
        return (outer_dir, inner_dir)

    init_env()
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is None and not outer_dir:
        raise Exception("Project configuration awry")
    if not inner_dir:
        inner_dir = scrapy_module.split('.')[0]
    if outer_dir and inner_dir:
        return (outer_dir, inner_dir)

    try:
        module = import_module(scrapy_module)
        outer_dir = os.path.dirname(os.path.dirname(module.__file__))
        return (outer_dir, inner_dir)
    except ImportError:
        raise Exception("Project configuration awry")
Example #5
 def build(self):
     closest = closest_scrapy_cfg()
     os.chdir(os.path.dirname(closest))
     if not os.path.exists('setup.py'):
         if not self.settings_name:
             self.settings_name = 'default'
         settings = get_config().get('settings', self.settings_name)
         self._create_default_setup_py(settings=settings)
         print('build egg use [%s] settings' % self.settings_name)
     if not self.file_path:
         self.file_path = tempfile.mkdtemp(prefix="scrapydeploy-")
     else:
         if not os.path.exists(self.file_path):
             os.mkdir(self.file_path)
     d = self.file_path
     o = open(os.path.join(d, "stdout"), "wb")
     e = open(os.path.join(d, "stderr"), "wb")
     retry_on_eintr(
         check_call,
         [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
         stdout=o,
         stderr=e)
     o.close()
     e.close()
     egg = glob.glob(os.path.join(d, '*.egg'))[0]
     self._rename_egg_name(egg, d)
Example #6
File: config.py Project: lygntx/scrapyc
 def _getsources(self):
     sources = ['/etc/scrapyd/scrapyd.conf', r'c:\scrapyd\scrapyd.conf']
     sources += sorted(glob.glob('/etc/scrapyd/conf.d/*'))
     sources += ['scrapyd.conf']
     scrapy_cfg = closest_scrapy_cfg()
     if scrapy_cfg:
         sources.append(scrapy_cfg)
     return sources
Example #7
File: config.py Project: jdb613/tracking
 def _getsources(self):
     sources = ['/etc/scrapyd/scrapyd.conf', r'c:\scrapyd\scrapyd.conf']
     sources += sorted(glob.glob('/etc/scrapyd/conf.d/*'))
     sources += ['scrapyd.conf']
     scrapy_cfg = closest_scrapy_cfg()
     if scrapy_cfg:
         sources.append(scrapy_cfg)
     return sources
Example #8
File: tool.py Project: csJd/zhihu_text
def get_prot_root():
    '''Get the project root directory

    Returns:
        proj_root_dir: project root dir

    '''
    return os.path.dirname(closest_scrapy_cfg())
Example #9
def get_project_root():
    """
    Returns the absolute path of the root of the project, and raises an exception
    if the current directory is not inside a project path
    """
    if inside_project():
        return os.path.dirname(closest_scrapy_cfg())
    raise Exception(os.getcwd() + " does not belong to a Scrapy project")
Example #10
def _delete_old_package():
    cfg_path = closest_scrapy_cfg()
    root = os.path.dirname(cfg_path)
    build = root + r"\build"
    egg = root + r"\project.egg-info"

    if os.path.exists(build):
        util.delete_folder(build)
    if os.path.exists(egg):
        util.delete_folder(egg)
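A portable variant is sketched below (an illustration only, assuming the same util.delete_folder helper used above): building the paths with os.path.join avoids the hard-coded Windows separators.

def _delete_old_package_portable():
    root = os.path.dirname(closest_scrapy_cfg())
    # Same cleanup as above, but platform independent.
    for name in ('build', 'project.egg-info'):
        path = os.path.join(root, name)
        if os.path.exists(path):
            util.delete_folder(path)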
Example #11
def inside_project():
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is not None:
        try:
            import_module(scrapy_module)
        except ImportError as exc:
            warnings.warn("Cannot import scrapy settings module %s: %s" % (scrapy_module, exc))
        else:
            return True
    return bool(closest_scrapy_cfg())
Example #12
 def _getsources(self):
     sources = ['/etc/scrapyd/scrapyd.conf', r'c:\scrapyd\scrapyd.conf']
     sources += sorted(glob.glob('/etc/scrapyd/conf.d/*'))
     sources += ['scrapyd.conf']
     env_source = os.environ.get('SCRAPYD_CONF')
     if env_source is not None:
         sources += [env_source]
     scrapy_cfg = closest_scrapy_cfg()
     if scrapy_cfg:
         sources.append(scrapy_cfg)
     return sources
Example #13
File: deploy.py Project: chzealot/scrapy
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call([sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d], stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #14
def project_data_dir(project='default'):
    """Return the current project data dir, creating it if it doesn't exist"""
    assert inside_project(), "Not inside project"
    scrapy_cfg = closest_scrapy_cfg()
    d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
    cfg = get_config()
    if cfg.has_option(DATADIR_CFG_SECTION, project):
        d = cfg.get(DATADIR_CFG_SECTION, project)
    if not exists(d):
        makedirs(d)
    return d
Example #15
File: deploy.py Project: robyoung/scrapy
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists("setup.py"):
        settings = get_config().get("settings", "default")
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call([sys.executable, "setup.py", "clean", "-a", "bdist_egg", "-d", d], stdout=f)
    egg = glob.glob(os.path.join(d, "*.egg"))[0]
    return egg, d
Example #16
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    check_call(
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #17
 def _getsources(self):
     """
     Get the configuration files
     :return:
     """
     sources = ['/etc/engine/engine.conf', r'c:\engine\engine.conf']
     sources += sorted(glob.glob('/etc/engine/conf.d/*'))
     sources += ['engine.conf']
     sources += [expanduser('~/.engine.conf')]
     scrapy_cfg = closest_scrapy_cfg()
     if scrapy_cfg:
         sources.append(scrapy_cfg)
     return sources
Example #18
 def from_crawler(cls, crawler):
     # This method is used by Scrapy to create your spiders.
     apple_secure = ''
     root_path = '/'.join(closest_scrapy_cfg().split('/')[0:-1])
     bot_path = root_path + '/' + crawler.settings['BOT_NAME'] + '/'
     config_path = (crawler.settings['APPLE_SECURE']
                    if crawler.settings['APPLE_SECURE'].startswith('/')
                    else bot_path + crawler.settings['APPLE_SECURE'])
     with open(config_path) as f:
         apple_secure = f.read()
     s = cls(crawler.settings, apple_secure)
     crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
     return s
Example #19
def inside_project():
    # 1. This variable has already been set by get_project_settings
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is not None:
        try:
            import_module(scrapy_module)  # check that the import succeeds
        except ImportError as exc:
            warnings.warn("Cannot import scrapy settings module %s: %s" %
                          (scrapy_module, exc))
        else:
            return True
    # 2. If not set, fall back to searching for the nearest scrapy.cfg
    return bool(closest_scrapy_cfg())
Example #20
def _build_egg(keep_build):
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp()
    f = tempfile.TemporaryFile(dir=d)
    retry_on_eintr(check_call, [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d], stdout=f)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    if not keep_build:
        shutil.rmtree('build')
        shutil.rmtree('project.egg-info')
    return egg, d
Example #21
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp(prefix="scrapydeploy-")
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(check_call, [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d], stdout=o, stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #22
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        settings = get_config().get('settings', 'default')
        _create_default_setup_py(settings=settings)
    d = tempfile.mkdtemp(prefix="scrapydeploy-")
    o = open(os.path.join(d, "stdout"), "wb")
    e = open(os.path.join(d, "stderr"), "wb")
    retry_on_eintr(check_call, [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d], stdout=o, stderr=e)
    o.close()
    e.close()
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #23
    def _settings_from_spider(cls, sp):
        logger = logging.getLogger(getattr(cls, "name") or __name__)
        # Given a class, workflow is following:
        # 1) get path to the file
        # 2) get global config locations from sconf.get_sources(False)
        # 3) search all of these for config files using sconf.closest_scrapy_cfg
        # 4) pass config filepaths to ConfigParser, with local overriding global
        # 5) search settings section of parsed config, looking for entry
        #    whose name matches the name of the package of the class, or 'default'
        #    entry if no other was found. This defines the module import path.
        # 6) Import the module

        spcls = sp if inspect.isclass(sp) else sp.__class__
        spcls_name = spcls.__name__
        spcls_path = inspect.getfile(spcls)

        global_cfg_files = sconf.get_sources(False)
        sp_cfg_file = sconf.closest_scrapy_cfg(spcls_path)
        cfg_files = [*global_cfg_files, sp_cfg_file]
        cfg = ConfigParser()
        cfg.read(cfg_files)

        # find the relevant entry in the respective config by iterating
        # over entries in config settings section, and find right entry
        # or default
        try:
            cfg_settings_sect = cfg['settings']
        except KeyError:
            logger.debug('Skipping settings import from ancestor "{}". '
                         'Cannot find scrapy.cfg file'.format(spcls_name))
            return

        pkg = spcls.__module__.split('.')[0]
        sttngs_path = cfg_settings_sect.get('default')

        for prj_name, sttngs in cfg_settings_sect.items():
            if prj_name == pkg:
                sttngs_path = sttngs
                break

        rel_sttngs_path = '.{}'.format(sttngs_path.split('.', 1)[1])
        logger.debug('Importing settings file of ancestor "{}" '
                     'from module "{}"'.format(spcls_name, sttngs_path))
        try:
            settings_mdl = importlib.import_module(rel_sttngs_path, pkg)
            return settings_mdl
        except ModuleNotFoundError:
            logger.debug('Skipping settings import from ancestor "{}". '
                         'Cannot find settings.py file'.format(spcls_name))
Example #24
def find_scrapy_project(project):
    project_config_path = closest_scrapy_cfg()
    if not project_config_path:
        raise RuntimeError('Cannot find scrapy.cfg file')
    project_config = ConfigParser()
    project_config.read(project_config_path)
    try:
        project_settings = project_config.get('settings', project)
    except (NoSectionError, NoOptionError) as e:
        raise RuntimeError(e.message)
    if not project_settings:
        raise RuntimeError('Cannot find scrapy project settings')
    project_location = os.path.dirname(project_config_path)
    sys.path.append(project_location)
    return project_settings
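A hedged usage sketch (the driver code below is an assumption, not part of the example): the returned settings path is typically exported as SCRAPY_SETTINGS_MODULE so that Scrapy's own helpers pick it up.

import os

from scrapy.utils.project import get_project_settings

# Hypothetical caller: resolve the 'default' entry of scrapy.cfg and
# let Scrapy load the corresponding settings module.
os.environ['SCRAPY_SETTINGS_MODULE'] = find_scrapy_project('default')
settings = get_project_settings()
print(settings.get('BOT_NAME'))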
Example #25
def find_scrapy_project(project):
    project_config_path = closest_scrapy_cfg()
    if not project_config_path:
        raise RuntimeError('Cannot find scrapy.cfg file')
    project_config = SafeConfigParser()
    project_config.read(project_config_path)
    try:
        project_settings = project_config.get('settings', project)
    except (NoSectionError, NoOptionError) as e:
        raise RuntimeError(e.message)
    if not project_settings:
        raise RuntimeError('Cannot find scrapy project settings')
    project_location = os.path.dirname(project_config_path)
    sys.path.append(project_location)
    return project_settings
Example #26
def inside_project():
    ## Get the project settings module 'XXX.settings' from the environment variables
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is not None:
        try:
            ## Import the project settings module
            import_module(scrapy_module)
        except ImportError as exc:
            warnings.warn("Cannot import scrapy settings module %s: %s" % (scrapy_module, exc))
        else:
            return True
    ## If the environment variable is not set, look for the nearest scrapy.cfg; finding one
    ## means we are inside a project. Some scrapy commands depend on a project while others
    ## are global, so this check mainly relies on locating the nearest scrapy.cfg file.
    return bool(closest_scrapy_cfg())
Example #27
def get_project_dir():
    closest_cfg = closest_scrapy_cfg()
    if closest_cfg:
        return Path(closest_cfg).parent

    init_env()
    scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if scrapy_module is None:
        return None

    try:
        module = import_module(scrapy_module)
        return Path(module.__file__).parent.parent
    except ImportError:
        return None
Example #28
def project_data_dir(project='default'):
    """Return the current project data dir, creating it if it doesn't exist"""
    if not inside_project():
        raise NotConfigured("Not inside a project")
    cfg = get_config()
    if cfg.has_option(DATADIR_CFG_SECTION, project):
        d = cfg.get(DATADIR_CFG_SECTION, project)
    else:
        scrapy_cfg = closest_scrapy_cfg()
        if not scrapy_cfg:
            raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir")
        d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
    if not exists(d):
        os.makedirs(d)
    return d
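A minimal usage sketch (the subdirectory and file names are hypothetical): an extension can persist state under the directory returned by project_data_dir.

import os

# Hypothetical consumer: keep a small state file inside the project's data dir.
state_dir = os.path.join(project_data_dir(), 'mystate')
os.makedirs(state_dir, exist_ok=True)
with open(os.path.join(state_dir, 'last_run.txt'), 'w') as f:
    f.write('ok')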
Example #29
 def __init__(self, *args, **kwargs):
     super(SnapshotSpider, self).__init__(*args, **kwargs)
     if 'start_url' in kwargs:
         self.start_urls = [kwargs.get('start_url')]
     elif 'start_urls' in kwargs:
         self.start_urls = kwargs.get('start_urls').split(',')
     else:
         self.start_urls = [ 'https://w5.ab.ust.hk/wcq/cgi-bin/' ]
     now = datetime.now(tz=pytz.timezone('Hongkong'))
     minute = '00' if now.minute < 30 else '30'
     self.time = now.strftime('%Y-%m-%d %H:') + minute
     self.template = lxml.html.parse('template.html')
     proj_root = os.path.dirname(closest_scrapy_cfg())
     # store the current snapshot at the directory self.snapshot_dir
     self.snapshot_dir = os.path.join(proj_root, 'current-snapshot')
     os.makedirs(f'{self.snapshot_dir}/subjects', exist_ok=True)
Example #30
def _build_egg():
    closest = closest_scrapy_cfg()
    os.chdir(os.path.dirname(closest))
    if not os.path.exists('setup.py'):
        scrapy_project_settings = get_config()
        settings = scrapy_project_settings.get('settings', 'default')
        project = scrapy_project_settings.get('deploy', 'project')
        _create_default_setup_py(settings=settings, project=project)
    d = 'dist'
    retry_on_eintr(
        check_call,
        [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d],
        stdout=sys.stdout,
        stderr=sys.stderr)
    egg = glob.glob(os.path.join(d, '*.egg'))[0]
    return egg, d
Example #31
def add_schedule():
    scrapy_cfg = closest_scrapy_cfg()
    cp = SafeConfigParser()
    if scrapy_cfg:
        cp.read(scrapy_cfg)

    parser = OptionParser(prog='scrapydd add_schedule')
    parser.add_option('-p', '--project', help='the project name')
    parser.add_option('-s', '--spider', help='the spider name')
    parser.add_option('-d', '--schedule', help='cron expression of schedule')
    parser.add_option('--host', help='the server address')
    opts, args = parser.parse_args()

    try:
        project = opts.project or cp.get('deploy', 'project')
    except Error:
        print 'Error: project is required'
        parser.print_help()
        return

    spider = opts.spider
    if spider is None:
        print 'Error: spider is required'
        parser.print_help()
        return

    try:
        host = opts.host or cp.get('deploy', 'url')
    except Error:
        print 'Error: host is required'
        parser.print_help()
        return

    schedule = opts.schedule
    if schedule is None:
        print 'Error: schedule is required'
        parser.print_help()
        return
    url = urlparse.urljoin(host, '/add_schedule.json')
    postdata = urllib.urlencode({
        'project': project,
        'spider': spider,
        'cron': schedule
    })
    res = urllib2.urlopen(url, postdata)
    print res.read()
Example #32
def find_scrapy_project(project, crawler="jianshu"):
    # project_config_path = closest_scrapy_cfg("./crawler/jianshu/")
    project_dir = "./crawler/{}/".format(crawler)
    print("project_dir======={}".format(project_dir))
    project_config_path = closest_scrapy_cfg(project_dir)
    if not project_config_path:
        raise RuntimeError('Cannot find scrapy.cfg file')
    project_config = SafeConfigParser()
    project_config.read(project_config_path)
    try:
        project_settings = project_config.get('settings', project)
    except (NoSectionError, NoOptionError) as e:
        raise RuntimeError(e.message)
    if not project_settings:
        raise RuntimeError('Cannot find scrapy project settings')
    project_location = os.path.dirname(project_config_path)
    sys.path.append(project_location)
    return project_settings
Example #33
File: sold0.py Project: GPSBach/luther
class TruliaSpider(scrapy.Spider):
    name = 'sold_0'
    allowed_domains = ['trulia.com']
    custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/iterate/sold_%(start)s_%(time)s.jl'), 
                       'FEED_FORMAT': 'jsonlines'}

    def __init__(self, state = 'IL', city = 'Chicago', start = 1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start = start
        self.start_urls = ['https://www.trulia.com/sold/{city},{state}/'.format(state=state, city=city)]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/p/')

    def parse(self, response):
        #N = 598 #trulia.TruliaSpider.get_number_of_pages_to_scrape(response)
        M = self.start
        N = M+49
        self.logger.info("Seaching between index page {M} and index page {N} ".format(N=N,M=M))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(M, N+1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        item_loader = TruliaItemLoader(item=TruliaItem(), response=response)
        trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response)

        details = item_loader.nested_css('.homeDetailsHeading')
        taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div')
        taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()')

        item = item_loader.load_item()
        trulia.TruliaSpider.post_process(item=item)
        return item
Example #34
class TruliaSpider(scrapy.Spider):
    name = 'trulia_sold'
    allowed_domains = ['trulia.com']
    custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_sold_%(state)s_%(city)s_%(time)s.jl'), 
                       'FEED_FORMAT': 'jsonlines'}

    def __init__(self, state='CA', city='San_Francisco', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start_urls = ['http://trulia.com/sold/{city},{state}/'.format(state=state, city=city)]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/homes/.+/sold/')

    def parse(self, response):
        N = trulia.TruliaSpider.get_number_of_pages_to_scrape(response)
        self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. Proceeding to scrape each index page...".format(N=N))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        item_loader = TruliaItemLoader(item=TruliaItem(), response=response)
        trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response)

        details = item_loader.nested_css('.homeDetailsHeading')
        taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div')
        taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()')

        item = item_loader.load_item()
        trulia.TruliaSpider.post_process(item=item)
        return item
Example #35
def inside_project():
    return bool(closest_scrapy_cfg())
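For reference, a rough sketch of what closest_scrapy_cfg does (an approximation, not the exact scrapy.utils.conf implementation): it walks up from the current directory and returns the path of the nearest scrapy.cfg, or an empty string if none is found.

import os

def closest_scrapy_cfg_sketch(path='.', prevpath=None):
    # Stop when the directory no longer changes (filesystem root reached).
    if path == prevpath:
        return ''
    path = os.path.abspath(path)
    cfgfile = os.path.join(path, 'scrapy.cfg')
    if os.path.exists(cfgfile):
        return cfgfile
    return closest_scrapy_cfg_sketch(os.path.dirname(path), path)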
Example #36
File: sale_0.py Project: GPSBach/luther
class TruliaSpider(scrapy.Spider):
    name = 'sale_0'
    allowed_domains = ['trulia.com']
    custom_settings = {
        'FEED_URI':
        os.path.join(os.path.dirname(closest_scrapy_cfg()),
                     'data/iterate/sale_%(start)s_%(time)s.jl'),
        'FEED_FORMAT':
        'jsonlines'
    }

    def __init__(self, state='IL', city='Chicago', start=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start = start
        self.start_urls = [
            'https://www.trulia.com/{state}/{city}'.format(state=state,
                                                           city=city)
        ]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/p/')

    def parse(self, response):
        #N = self.get_number_of_pages_to_scrape(response)
        M = self.start
        N = M + 49  #self.stop
        self.logger.info(
            "Searching between index page {M} and index page {N}".format(
                M=M, N=N))
        for url in [
                response.urljoin("{n}_p/".format(n=n))
                for n in range(M, N + 1)
        ]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    @staticmethod
    def get_number_of_pages_to_scrape(response):
        #pagination = response.css('.paginationContainer').xpath('.//*/text()[contains(., "Results")]')
        #number_of_results = int(pagination.re_first(r'^1 - 30 of ([\d,]+) Results$').replace(',', ''))
        number_of_results = 249
        return math.ceil(number_of_results / 30)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url,
                                 callback=self.parse_property_page)

    def parse_property_page(self, response):
        l = TruliaItemLoader(item=TruliaItem(), response=response)
        self.load_common_fields(item_loader=l, response=response)

        listing_information = l.nested_xpath(
            '//span[text() = "LISTING INFORMATION"]')
        listing_information.add_xpath(
            'listing_information',
            './parent::div/following-sibling::ul[1]/li/text()')
        listing_information.add_xpath('listing_information_date_updated',
                                      './following-sibling::span/text()',
                                      re=r'^Updated: (.*)')

        public_records = l.nested_xpath('//span[text() = "PUBLIC RECORDS"]')
        public_records.add_xpath(
            'public_records',
            './parent::div/following-sibling::ul[1]/li/text()')
        public_records.add_xpath('public_records_date_updated',
                                 './following-sibling::span/text()',
                                 re=r'^Updated: (.*)')

        item = l.load_item()
        self.post_process(item=item)
        return item

    @staticmethod
    def load_common_fields(item_loader, response):
        '''Load field values which are common to "on sale" and "recently sold" properties.'''
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('address', '//*[@data-role="address"]/text()')
        item_loader.add_xpath('city_state',
                              '//*[@data-role="cityState"]/text()')
        item_loader.add_xpath('price',
                              '//span[@data-role="price"]/text()',
                              re=r'\$([\d,]+)')
        item_loader.add_xpath(
            'neighborhood',
            '//*[@data-role="cityState"]/parent::h1/following-sibling::span/a/text()'
        )
        details = item_loader.nested_css('.homeDetailsHeading')
        overview = details.nested_xpath(
            './/span[contains(text(), "Overview")]/parent::div/following-sibling::div[1]'
        )
        overview.add_xpath('overview', xpath='.//li/text()')
        overview.add_xpath('area', xpath='.//li/text()', re=r'([\d,]+) sqft$')
        overview.add_xpath('lot_size',
                           xpath='.//li/text()',
                           re=r'([\d,.]+) (?:acres|sqft) lot size$')
        overview.add_xpath('lot_size_units',
                           xpath='.//li/text()',
                           re=r'[\d,.]+ (acres|sqft) lot size$')
        overview.add_xpath('price_per_square_foot',
                           xpath='.//li/text()',
                           re=r'\$([\d,.]+)/sqft$')
        overview.add_xpath('bedrooms',
                           xpath='.//li/text()',
                           re=r'(\d+) (?:Beds|Bed|beds|bed)$')
        overview.add_xpath('bathrooms',
                           xpath='.//li/text()',
                           re=r'(\d+) (?:Baths|Bath|baths|bath)$')
        overview.add_xpath('year_built',
                           xpath='.//li/text()',
                           re=r'Built in (\d+)')
        overview.add_xpath('days_on_Trulia',
                           xpath='.//li/text()',
                           re=r'([\d,]+) days on Trulia$')
        overview.add_xpath('views',
                           xpath='.//li/text()',
                           re=r'([\d,]+) views$')
        #item_loader.add_css('description', '#descriptionContainer *::text')

        price_events = details.nested_xpath(
            './/*[text() = "Price History"]/parent::*/following-sibling::*[1]/div/div'
        )
        price_events.add_xpath('prices', './div[contains(text(), "$")]/text()')
        price_events.add_xpath(
            'dates',
            './div[contains(text(), "$")]/preceding-sibling::div/text()')
        price_events.add_xpath(
            'events',
            './div[contains(text(), "$")]/following-sibling::div/text()')

    @staticmethod
    def post_process(item):
        '''Add any additional data to an item after loading it'''
        if item.get('dates') is not None:
            dates = [
                datetime.datetime.strptime(date, '%m/%d/%Y')
                for date in item['dates']
            ]
            prices = [
                int(price.lstrip('$').replace(',', ''))
                for price in item['prices']
            ]
            item['price_history'] = sorted(list(
                zip(dates, prices, item['events'])),
                                           key=lambda x: x[0])
Example #37
def project_root():
    return os.path.dirname(closest_scrapy_cfg())
Example #38
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.utils.conf import closest_scrapy_cfg
import csv
import random
import os

proj_root = os.path.dirname(closest_scrapy_cfg())
basepath = proj_root + "/site_scraper/"  #path to folder containing "spiders" directory

spiderpath = basepath + "spiders/"
urlfile = open(spiderpath + "urls.csv", 'r')
domainfile = open(spiderpath + "domains.csv", 'r')

r1 = csv.reader(urlfile)
next(r1)
r2 = csv.reader(domainfile)
next(r2)
urls = [row[0] for row in r1]
domains = [row[0] for row in r2]
counter = 0


class TdSpider(CrawlSpider):
    name = 'td'
    allowed_domains = domains
    start_urls = urls
    rules = (Rule(LinkExtractor(), callback="parse_obj", follow=True), )
Example #39
# -*- coding: utf-8 -*-
import os
from scrapy.utils.conf import closest_scrapy_cfg

closest = closest_scrapy_cfg()
assert closest
# Get the absolute path of the project directory
projdir = os.path.dirname(closest)

BOT_NAME = 'Crawler'
SPIDER_MODULES = ['Crawler.spiders']
NEWSPIDER_MODULE = 'Crawler.spiders'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 16
# Maximum number of concurrent items (per response) to process in parallel in the Item Processor
CONCURRENT_ITEMS = 1000
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16

# CONCURRENT_REQUESTS_PER_DOMAIN = 5
# CONCURRENT_REQUESTS_PER_IP = 6
# CONCURRENT_REQUESTS_PER_IP=16

# The maximum depth that will be allowed to crawl for any site. If zero, no limit will be imposed.
DEPTH_LIMIT = 0
# An integer that is used to adjust the request priority based on its depth. If zero, no priority adjustment is made from depth.
DEPTH_PRIORITY = 0
Example #40
    def __init__(self, settings):
        self.scopes = settings['GOOGLE_AUTH_SCOPES']

        root_path = '/'.join(closest_scrapy_cfg().split('/')[0:-1])
        bot_path = root_path + '/' + settings['BOT_NAME'] + '/'
        self.credentials = (settings['GOOGLE_AUTH_CREDENTIAL_PATH']
                            if settings['GOOGLE_AUTH_CREDENTIAL_PATH'][0:1] == '/'
                            else bot_path + settings['GOOGLE_AUTH_CREDENTIAL_PATH'])
Example #41
 def __init__(self, settings):
     self.settings = settings
     self.scrapy_module_path = os.path.dirname(closest_scrapy_cfg())
     self.spider_modules = settings.getlist('SPIDER_MODULES')