示例#1
0
    def run(self, args, opts):
        """Create a new Scrapy project directory named after the single argument.

        Raises ``UsageError`` unless exactly one argument is given. Exits
        with status 1 when the name does not match ``^[_a-zA-Z]\\w*$`` or
        when a path with that name already exists.
        """
        if len(args) != 1:
            raise UsageError()
        project_name = args[0]
        name_match = re.search(r'^[_a-zA-Z]\w*$', project_name)
        if not name_match:
            print('Error: Project names must begin with a letter and contain only\n'
                  'letters, numbers and underscores')
            sys.exit(1)
        elif exists(project_name):
            print("Error: directory %r already exists" % project_name)
            sys.exit(1)

        # Copy the package skeleton to <name>/<name>, then put the project
        # configuration file next to it.
        package_dir = join(project_name, project_name)
        copytree(join(TEMPLATES_PATH, 'module'), package_dir, ignore=IGNORE)
        shutil.copy(join(TEMPLATES_PATH, 'scrapy.cfg'), project_name)

        # Template paths may contain ${project_name}; resolve it before
        # rendering each file.
        for parts in TEMPLATES_TO_RENDER:
            relative_tpl = string.Template(join(*parts)).substitute(
                project_name=project_name)
            render_templatefile(
                join(project_name, relative_tpl),
                project_name=project_name,
                ProjectName=string_camelcase(project_name),
            )

        print("New Scrapy project %r created in:" % project_name)
        print("    %s\n" % abspath(project_name))
        print("You can start your first spider with:")
        print("    cd %s" % project_name)
        print("    scrapy genspider example example.com")
示例#2
0
    def run(self, project, opt=None):
        """Create a web-walker project named ``project`` in a directory of the same name.

        Returns early with ``self.exitcode`` set to 1 when the directory
        already holds a ``scrapy.cfg`` or when ``self._is_valid_name``
        rejects the name. ``opt`` is not used.
        """
        # The project name is also used as the target directory.
        project_name = project_dir = project

        cfg_path = join(project_dir, 'scrapy.cfg')
        if exists(cfg_path):
            self.exitcode = 1
            print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
            return

        name_ok = self._is_valid_name(project_name)
        if not name_ok:
            self.exitcode = 1
            return

        # Copy the whole template tree, then rename its placeholder package.
        self._copytree(self.templates_dir, abspath(project_dir))
        placeholder_pkg = join(project_dir, 'module')
        move(placeholder_pkg, join(project_dir, project_name))

        for parts in TEMPLATES_TO_RENDER:
            relative_tpl = string.Template(join(*parts)).substitute(
                project_name=project_name)
            render_templatefile(
                join(project_dir, relative_tpl),
                project_name=project_name,
                ProjectName=string_camelcase(project_name),
            )

        print("New web-walker project %r, using template directory %r, created in:"
              % (project_name, self.templates_dir))
        print("    %s\n" % abspath(project_dir))
        print("You can start the demo spider with:")
        print("    custom-redis-server --host 127.0.0.1 -p 6379")
        print("    cd %s" % project_dir)
        print("    scrapy crawl bluefly")
示例#3
0
    def run(self, args, opts):
        """Create a Scrapy project from ``self.templates_dir``.

        ``args`` holds the project name and, optionally, the target
        directory (which defaults to the project name). Raises
        ``UsageError`` for any other argument count. Returns early with
        ``self.exitcode`` set to 1 when the target already contains a
        ``scrapy.cfg`` or when ``self._is_valid_name`` rejects the name.
        """
        if not 1 <= len(args) <= 2:
            raise UsageError()

        project_name = args[0]
        project_dir = args[1] if len(args) == 2 else args[0]

        cfg_path = join(project_dir, 'scrapy.cfg')
        if exists(cfg_path):
            self.exitcode = 1
            print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
            return

        name_ok = self._is_valid_name(project_name)
        if not name_ok:
            self.exitcode = 1
            return

        # Copy the whole template tree, then rename its placeholder package.
        self._copytree(self.templates_dir, abspath(project_dir))
        placeholder_pkg = join(project_dir, 'module')
        move(placeholder_pkg, join(project_dir, project_name))

        for parts in TEMPLATES_TO_RENDER:
            relative_tpl = string.Template(join(*parts)).substitute(
                project_name=project_name)
            render_templatefile(
                join(project_dir, relative_tpl),
                project_name=project_name,
                ProjectName=string_camelcase(project_name),
            )

        summary = ("New Scrapy project '%s', using template directory '%s', "
                   "created in:" % (project_name, self.templates_dir))
        print(summary)
        print("    %s\n" % abspath(project_dir))
        print("You can start your first spider with:")
        print("    cd %s" % project_dir)
        print("    scrapy genspider example example.com")
示例#4
0
    def run(self, args, opts):
        """Create a Scrapy project from ``self.templates_dir``.

        ``args`` is ``[project_name]`` or ``[project_name, project_dir]``;
        without a directory the project name is used as the directory.
        Raises ``UsageError`` for any other argument count. Sets
        ``self.exitcode`` to 1 and returns when ``scrapy.cfg`` already
        exists in the target or ``self._is_valid_name`` rejects the name.
        """
        arg_count = len(args)
        if arg_count not in (1, 2):
            raise UsageError()

        project_name = args[0]
        project_dir = args[1] if arg_count == 2 else args[0]

        if exists(join(project_dir, 'scrapy.cfg')):
            self.exitcode = 1
            print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
            return

        if not self._is_valid_name(project_name):
            self.exitcode = 1
            return

        # Lay down the template tree, then give the placeholder package
        # directory the project's name.
        destination = abspath(project_dir)
        self._copytree(self.templates_dir, destination)
        move(join(project_dir, 'module'), join(project_dir, project_name))

        for path_parts in TEMPLATES_TO_RENDER:
            template_path = string.Template(join(*path_parts))
            resolved = template_path.substitute(project_name=project_name)
            render_templatefile(
                join(project_dir, resolved),
                project_name=project_name,
                ProjectName=string_camelcase(project_name),
            )

        print("New Scrapy project %r, using template directory %r, created in:"
              % (project_name, self.templates_dir))
        print("    %s\n" % abspath(project_dir))
        print("You can start your first spider with:")
        print("    cd %s" % project_dir)
        print("    scrapy genspider example example.com")
示例#5
0
 def _genspider(self, module, name, domain, template_name, template_file):
     """Create ``<module>.py`` from ``template_file`` and render it in place.

     The file is written to the directory of the ``NEWSPIDER_MODULE``
     package when that setting is set, and to the current directory
     otherwise. A short confirmation is printed afterwards.
     """
     # "my_spider" -> "MySpider"
     class_prefix = ''.join(part.capitalize() for part in module.split('_'))
     tvars = dict(
         project_name=self.settings.get('BOT_NAME'),
         ProjectName=string_camelcase(self.settings.get('BOT_NAME')),
         module=module,
         name=name,
         domain=domain,
         classname='%sSpider' % class_prefix,
     )

     # Default to the working directory unless a spiders package is configured.
     spiders_module = None
     spiders_dir = "."
     if self.settings.get('NEWSPIDER_MODULE'):
         spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
         spiders_dir = abspath(dirname(spiders_module.__file__))

     spider_file = join(spiders_dir, module) + ".py"
     shutil.copyfile(template_file, spider_file)
     render_templatefile(spider_file, **tvars)

     created_msg = "Created spider %r using template %r " % (name, template_name)
     if spiders_module:
         # Keep the cursor on the same line so the module path continues it.
         print(created_msg, end='')
         print("in module:\n  %s.%s" % (spiders_module.__name__, module))
     else:
         print(created_msg, end='\n')
示例#6
0
 def _genspider(self, module, name, domain, template_name, template_file):
     """Create ``<module>.py`` from ``template_file`` and render it in place.

     The destination is the directory of the ``NEWSPIDER_MODULE`` package
     when that setting is set, otherwise the current directory. Prints a
     confirmation that names the spider, the template and, if known, the
     module path.
     """
     # "my_spider" -> "MySpider"
     class_prefix = "".join(word.capitalize() for word in module.split("_"))
     tvars = dict(
         project_name=self.settings.get("BOT_NAME"),
         ProjectName=string_camelcase(self.settings.get("BOT_NAME")),
         module=module,
         name=name,
         domain=domain,
         classname=f"{class_prefix}Spider",
     )

     # Default to the working directory unless a spiders package is configured.
     spiders_module = None
     spiders_dir = "."
     if self.settings.get("NEWSPIDER_MODULE"):
         spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
         spiders_dir = abspath(dirname(spiders_module.__file__))

     spider_file = join(spiders_dir, module) + ".py"
     shutil.copyfile(template_file, spider_file)
     render_templatefile(spider_file, **tvars)

     created_msg = f"Created spider {name!r} using template {template_name!r} "
     if spiders_module:
         # Keep the cursor on the same line so the module path continues it.
         print(created_msg, end="")
         print(f"in module:\n  {spiders_module.__name__}.{module}")
     else:
         print(created_msg, end="\n")
示例#7
0
    def run(self, args, opts):
        """Create a new Scrapy project directory named after the single argument.

        Raises ``UsageError`` unless exactly one argument is given. Sets
        ``self.exitcode`` to 1 and returns when ``self._is_valid_name``
        rejects the name.
        """
        if len(args) != 1:
            raise UsageError()
        project_name = args[0]

        name_ok = self._is_valid_name(project_name)
        if not name_ok:
            self.exitcode = 1
            return

        # Copy the package skeleton to <name>/<name>, then put the project
        # configuration file next to it.
        package_dir = join(project_name, project_name)
        copytree(join(TEMPLATES_PATH, 'module'), package_dir, ignore=IGNORE)
        shutil.copy(join(TEMPLATES_PATH, 'scrapy.cfg'), project_name)

        for parts in TEMPLATES_TO_RENDER:
            relative_tpl = string.Template(join(*parts)).substitute(
                project_name=project_name)
            render_templatefile(
                join(project_name, relative_tpl),
                project_name=project_name,
                ProjectName=string_camelcase(project_name),
            )

        print("New Scrapy project %r created in:" % project_name)
        print("    %s\n" % abspath(project_name))
        print("You can start your first spider with:")
        print("    cd %s" % project_name)
        print("    scrapy genspider example example.com")
    def run(self, project, opt=None):
        """Create a structure-spider project named ``project`` in a directory of the same name.

        Returns early with ``self.exitcode`` set to 1 when the directory
        already holds a ``scrapy.cfg`` or when ``self._is_valid_name``
        rejects the name. ``opt`` is not used.
        """
        # The project name is also used as the target directory.
        project_name = project_dir = project

        cfg_path = join(project_dir, 'scrapy.cfg')
        if exists(cfg_path):
            self.exitcode = 1
            print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
            return

        name_ok = self._is_valid_name(project_name)
        if not name_ok:
            self.exitcode = 1
            return

        # Copy the whole template tree, then rename its placeholder package.
        self._copytree(self.templates_dir, abspath(project_dir))
        placeholder_pkg = join(project_dir, 'module')
        move(placeholder_pkg, join(project_dir, project_name))

        # Only these two files are rendered; the second path is resolved
        # against the project name first.
        templates_to_render = (
            ('scrapy.cfg',),
            ('${project_name}', 'settings.py.tmpl'),
        )
        for parts in templates_to_render:
            relative_tpl = string.Template(join(*parts)).substitute(
                project_name=project_name)
            render_templatefile(
                join(project_dir, relative_tpl),
                project_name=project_name,
                ProjectName=string_camelcase(project_name),
            )

        print("New structure-spider project %r, using template directory %r, created in:"
              % (project_name, self.templates_dir))
        print("    %s\n" % abspath(project_dir))
        print("You can start the spider with:")
        print("    cd %s" % project_dir)
        print("    custom-redis-server -ll INFO -lf &")
        print("    scrapy crawl douban")
示例#9
0
    def run(self, args, opts):
        """Scaffold a new classifier and interactively write its settings.cfg.

        Expects exactly one argument, the classifier name, and raises
        ``UsageError`` otherwise. Exits with status 1 when the name contains
        anything other than lower-case letters and underscores, or when
        ``data/<name>`` already exists.

        The "data" and "to_upload" directories are created in the current
        working directory when missing. The classifier template is copied to
        ``CLASSIFIERS_PATH/<name>`` and the files listed in
        ``TEMPLATES_TO_RENDER`` are rendered. Finally the user is prompted
        for the class labels and, per label, whether to collect data for it;
        the answers are written to ``data/<name>/settings.cfg``.
        """
        if len(args) != 1:
            raise UsageError()
        classifier_name = args[0]
        if not re.search(r'^[_a-z]*$', classifier_name):
            print('Error: Classifier names must be entirely lower case')
            sys.exit(1)
        elif exists(join("data", classifier_name)):
            # Every other path in this command is relative to the working
            # directory, so the check is relative as well; a path anchored
            # at the filesystem root would never see an existing classifier.
            print("Error: directory %r already exists" % classifier_name)
            sys.exit(1)

        # If this is the first classifier, create the package that holds them.
        if not os.path.exists("data"):
            os.makedirs("data")
            # Create an empty __init__.py; the with-block closes the file.
            with open("data/__init__.py", "wb"):
                pass
        if not os.path.exists("to_upload"):
            os.makedirs("to_upload")

        # Make classifier file
        moduletpl = join(TEMPLATES_PATH, 'classifier')
        copytree(moduletpl,
                 join(CLASSIFIERS_PATH, classifier_name),
                 ignore=IGNORE)
        for paths in TEMPLATES_TO_RENDER:
            path = join(*paths)
            tplfile = join(
                CLASSIFIERS_PATH,
                string.Template(path).substitute(
                    classifier_name=classifier_name))
            render_templatefile(
                tplfile,
                classifier_name=classifier_name,
                ClassifierName=string_camelcase(classifier_name))

        # Make settings.cfg file
        config = ConfigParser.RawConfigParser()
        config.add_section("Classifier")
        classifications = raw_input(
            "Please input classifications separated by commas\n").split(",")
        config.set("Classifier", "classes",
                   ",".join(sorted(c.strip() for c in classifications)))
        for class_type in config.get("Classifier", "classes").split(","):
            answer = raw_input(
                "Collect data classified as {0}?\n1. Yes\n 2. No".format(
                    class_type))
            try:
                keep = int(answer)
            except ValueError:
                # A non-numeric answer counts as "No", like any number other
                # than 1, instead of aborting after the files were created.
                keep = 0
            config.set("Classifier", class_type, keep == 1)
        with open("data/{0}/settings.cfg".format(classifier_name),
                  "wb") as configfile:
            config.write(configfile)
示例#10
0
 def _gen_tests(self, name, classname, start_url, fixture_file, template_file):
     """Copy the test template to ``test_<name>.py`` and render it.

     The template is read from ``self.templates_dir`` and the result is
     written to ``self.tests_dir``. ``start_url`` is only passed to the
     template when ``name`` does not contain "legistar". The path of the
     created file is printed.
     """
     template_dict = dict(
         name=name,
         classname=classname,
         fixture_file=fixture_file,
         date_str=datetime.now().strftime("%Y-%m-%d"),
     )
     if "legistar" not in name:
         template_dict["start_url"] = start_url

     source_template = join(self.templates_dir, template_file)
     test_file = join(self.tests_dir, "test_{}.py".format(name))
     shutil.copyfile(source_template, test_file)
     render_templatefile(test_file, **template_dict)
     print("Created file: {}".format(test_file))
示例#11
0
 def _genspider(self, name, agency, classname, domain, start_url, template_file):
     """Copy the spider template to ``<name>.py`` and render it.

     The template is read from ``self.templates_dir`` and the spider is
     written to ``self.spiders_dir``. The class name given to the template
     is derived from ``name`` (``"my_agency"`` -> ``"MyAgencySpider"``);
     the ``classname`` argument is not used here.
     """
     camel_name = string.capwords(name, sep="_").replace("_", "")
     template_dict = dict(
         name=name,
         agency=agency,
         domain=domain,
         start_url=start_url,
         classname="{}Spider".format(camel_name),
     )

     source_template = join(self.templates_dir, template_file)
     spider_file = "{}.py".format(join(self.spiders_dir, name))
     shutil.copyfile(source_template, spider_file)
     render_templatefile(spider_file, **template_dict)
     print("Created file: {}".format(spider_file))
示例#12
0
 def _genspider(self, module, name, domain, template_name, template_file):
     """Create ``<module>.py`` inside the ``NEWSPIDER_MODULE`` package.

     ``template_file`` is copied next to the package's own file and then
     rendered in place with the project, module, spider and domain names.
     """
     # "my_spider" -> "MySpider"
     camel_module = "".join(piece.capitalize() for piece in module.split("_"))
     tvars = dict(
         project_name=self.settings.get("BOT_NAME"),
         ProjectName=string_camelcase(self.settings.get("BOT_NAME")),
         module=module,
         name=name,
         domain=domain,
         classname="%sSpider" % camel_module,
     )

     spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
     package_dir = abspath(dirname(spiders_module.__file__))
     spider_file = join(package_dir, module) + ".py"

     shutil.copyfile(template_file, spider_file)
     render_templatefile(spider_file, **tvars)

     print("Created spider %r using template %r in module:" % (name, template_name))
     print("  %s.%s" % (spiders_module.__name__, module))
示例#13
0
文件: genspider.py 项目: 1012/scrapy
 def _genspider(self, module, name, domain, template_name, template_file):
     """Generate the spider module, based on the given template.

     ``template_file`` is copied to ``<module>.py`` in the directory of the
     ``NEWSPIDER_MODULE`` package and rendered in place with the project,
     module, spider and domain names. A confirmation is printed afterwards.
     """
     tvars = {
         'project_name': self.settings.get('BOT_NAME'),
         'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
         'module': module,
         'name': name,
         'domain': domain,
         # "my_spider" -> "MySpiderSpider"
         'classname': '%sSpider' % ''.join(
             s.capitalize() for s in module.split('_')),
     }
     # A non-empty fromlist makes __import__ return the named submodule
     # itself rather than the top-level package.
     spiders_module = __import__(self.settings['NEWSPIDER_MODULE'], {}, {}, [''])
     spiders_dir = abspath(dirname(spiders_module.__file__))
     spider_file = "%s.py" % join(spiders_dir, module)
     shutil.copyfile(template_file, spider_file)
     render_templatefile(spider_file, **tvars)
     # Single-argument print(...) prints the same text under the Python 2
     # print statement and the Python 3 print function.
     print("Created spider %r using template %r in module:" % (name,
                                                               template_name))
     print("  %s.%s" % (spiders_module.__name__, module))
示例#14
0
 def _genspider(self, module, name, domain, template_name, template_file):
     """Generate the spider module, based on the given template.

     ``template_file`` is copied to ``<module>.py`` in the directory of the
     ``NEWSPIDER_MODULE`` package and rendered in place with the project,
     module, spider and domain names. Settings are read from the
     module-level ``settings`` object. A confirmation is printed afterwards.
     """
     tvars = {
         'project_name': settings.get('BOT_NAME'),
         'ProjectName': string_camelcase(settings.get('BOT_NAME')),
         'module': module,
         'name': name,
         'domain': domain,
         # "my_spider" -> "MySpiderSpider"
         'classname': '%sSpider' % ''.join(
             s.capitalize() for s in module.split('_')),
     }
     # A non-empty fromlist makes __import__ return the named submodule
     # itself rather than the top-level package.
     spiders_module = __import__(settings['NEWSPIDER_MODULE'], {}, {}, [''])
     spiders_dir = abspath(dirname(spiders_module.__file__))
     spider_file = "%s.py" % join(spiders_dir, module)
     shutil.copyfile(template_file, spider_file)
     render_templatefile(spider_file, **tvars)
     # Single-argument print(...) prints the same text under the Python 2
     # print statement and the Python 3 print function.
     print("Created spider %r using template %r in module:" % (name,
                                                               template_name))
     print("  %s.%s" % (spiders_module.__name__, module))
示例#15
0
    def run(self, args, opts):
        """Create a new Scrapy project directory named after the single argument.

        Raises ``UsageError`` unless exactly one argument is given. Exits
        with status 1 when the name does not match ``^[_a-zA-Z]\\w*$`` or
        when a path with that name already exists. Otherwise the module
        template and ``scrapy.cfg`` are copied from ``TEMPLATES_PATH`` and
        the files listed in ``TEMPLATES_TO_RENDER`` are rendered.
        """
        if len(args) != 1:
            raise UsageError()
        project_name = args[0]
        # Single-argument print(...) prints the same text under the Python 2
        # print statement and the Python 3 print function.
        if not re.search(r'^[_a-zA-Z]\w*$', project_name):
            print('Error: Project names must begin with a letter and contain only\n'
                  'letters, numbers and underscores')
            sys.exit(1)
        elif exists(project_name):
            print("Error: directory %r already exists" % project_name)
            sys.exit(1)

        moduletpl = join(TEMPLATES_PATH, 'module')
        copytree(moduletpl, join(project_name, project_name), ignore=IGNORE)
        shutil.copy(join(TEMPLATES_PATH, 'scrapy.cfg'), project_name)
        for paths in TEMPLATES_TO_RENDER:
            path = join(*paths)
            # Template paths may contain ${project_name}; resolve it first.
            tplfile = join(project_name,
                string.Template(path).substitute(project_name=project_name))
            render_templatefile(tplfile, project_name=project_name,
                ProjectName=string_camelcase(project_name))
示例#16
0
    def test_simple_render(self):
        """Rendering ``templ.py.tmpl`` should leave only a substituted ``templ.py``."""
        substitutions = {
            'project_name': 'proj',
            'name': 'spi',
            'classname': 'TheSpider',
        }
        template_text = u'from ${project_name}.spiders.${name} import ${classname}'
        expected_text = u'from proj.spiders.spi import TheSpider'

        source_path = os.path.join(self.tmp_path, 'templ.py.tmpl')
        output_path = os.path.join(self.tmp_path, 'templ.py')

        # Write the template fixture to disk.
        with open(source_path, 'wb') as handle:
            handle.write(template_text.encode('utf8'))
        # A failure here means the test setup is broken, not the code under test.
        assert os.path.isfile(source_path)

        render_templatefile(source_path, **substitutions)

        # The ".tmpl" file must be gone and the rendered file must hold the
        # substituted text.
        self.assertFalse(os.path.exists(source_path))
        with open(output_path, 'rb') as handle:
            actual_text = handle.read().decode('utf8')
        self.assertEqual(actual_text, expected_text)

        # Clean up; a failure here is a failure of the test itself.
        os.remove(output_path)
        assert not os.path.exists(output_path)
示例#17
0
    def test_simple_render(self):
        """Rendering ``templ.py.tmpl`` should leave only a substituted ``templ.py``."""
        substitutions = {
            "project_name": "proj",
            "name": "spi",
            "classname": "TheSpider",
        }
        template_text = "from ${project_name}.spiders.${name} import ${classname}"
        expected_text = "from proj.spiders.spi import TheSpider"

        source_path = os.path.join(self.tmp_path, "templ.py.tmpl")
        output_path = os.path.join(self.tmp_path, "templ.py")

        # Write the template fixture to disk.
        with open(source_path, "wb") as handle:
            handle.write(template_text.encode("utf8"))
        # A failure here means the test setup is broken, not the code under test.
        assert os.path.isfile(source_path)

        render_templatefile(source_path, **substitutions)

        # The ".tmpl" file must be gone and the rendered file must hold the
        # substituted text.
        self.assertFalse(os.path.exists(source_path))
        with open(output_path, "rb") as handle:
            actual_text = handle.read().decode("utf8")
        self.assertEqual(actual_text, expected_text)

        # Clean up; a failure here is a failure of the test itself.
        os.remove(output_path)
        assert not os.path.exists(output_path)
示例#18
0
    def test_simple_render(self):
        """Rendering ``templ.py.tmpl`` should leave only a substituted ``templ.py``."""
        substitutions = {
            'project_name': 'proj',
            'name': 'spi',
            'classname': 'TheSpider',
        }
        template_text = 'from ${project_name}.spiders.${name} import ${classname}'
        expected_text = 'from proj.spiders.spi import TheSpider'

        source_path = os.path.join(self.tmp_path, 'templ.py.tmpl')
        output_path = os.path.join(self.tmp_path, 'templ.py')

        # Write the template fixture to disk.
        with open(source_path, 'wb') as handle:
            handle.write(template_text.encode('utf8'))
        # A failure here means the test setup is broken, not the code under test.
        assert os.path.isfile(source_path)

        render_templatefile(source_path, **substitutions)

        # The ".tmpl" file must be gone and the rendered file must hold the
        # substituted text.
        self.assertFalse(os.path.exists(source_path))
        with open(output_path, 'rb') as handle:
            actual_text = handle.read().decode('utf8')
        self.assertEqual(actual_text, expected_text)

        # Clean up; a failure here is a failure of the test itself.
        os.remove(output_path)
        assert not os.path.exists(output_path)
示例#19
0
 def run(self, args, opts):
     """Scaffold a new classifier and interactively write its settings.cfg.

     Expects exactly one argument, the classifier name, and raises
     ``UsageError`` otherwise. Exits with status 1 when the name contains
     anything other than lower-case letters and underscores, or when
     ``data/<name>`` already exists.

     The "data" and "to_upload" directories are created in the current
     working directory when missing. The classifier template is copied to
     ``CLASSIFIERS_PATH/<name>`` and the files listed in
     ``TEMPLATES_TO_RENDER`` are rendered. Finally the user is prompted for
     the class labels and, per label, whether to collect data for it; the
     answers are written to ``data/<name>/settings.cfg``.
     """
     if len(args) != 1:
         raise UsageError()
     classifier_name = args[0]
     if not re.search(r'^[_a-z]*$', classifier_name):
         print('Error: Classifier names must be entirely lower case')
         sys.exit(1)
     elif exists(join("data", classifier_name)):
         # Every other path in this command is relative to the working
         # directory, so the check is relative as well; a path anchored at
         # the filesystem root would never see an existing classifier.
         print("Error: directory %r already exists" % classifier_name)
         sys.exit(1)

     # If this is the first classifier, create the package that holds them.
     if not os.path.exists("data"):
         os.makedirs("data")
         # Create an empty __init__.py; the with-block closes the file.
         with open("data/__init__.py", "wb"):
             pass
     if not os.path.exists("to_upload"):
         os.makedirs("to_upload")

     # Make classifier file
     moduletpl = join(TEMPLATES_PATH, 'classifier')
     copytree(moduletpl, join(CLASSIFIERS_PATH, classifier_name), ignore=IGNORE)
     for paths in TEMPLATES_TO_RENDER:
         path = join(*paths)
         tplfile = join(CLASSIFIERS_PATH,
             string.Template(path).substitute(classifier_name=classifier_name))
         render_templatefile(tplfile, classifier_name=classifier_name,
             ClassifierName=string_camelcase(classifier_name))

     # Make settings.cfg file
     config = ConfigParser.RawConfigParser()
     config.add_section("Classifier")
     classifications = raw_input("Please input classifications separated by commas\n").split(",")
     config.set("Classifier", "classes", ",".join(sorted(c.strip() for c in classifications)))
     for class_type in config.get("Classifier", "classes").split(","):
         answer = raw_input("Collect data classified as {0}?\n1. Yes\n 2. No".format(class_type))
         try:
             keep = int(answer)
         except ValueError:
             # A non-numeric answer counts as "No", like any number other
             # than 1, instead of aborting after the files were created.
             keep = 0
         config.set("Classifier", class_type, keep == 1)
     with open("data/{0}/settings.cfg".format(classifier_name), "wb") as configfile:
         config.write(configfile)
示例#20
0
    def run(self, args, opts):
        """Create a Scrapy project from ``self.templates_dir``.

        ``args`` is ``[project_name]`` or ``[project_name, project_dir]``;
        without a directory the project name is used as the directory.
        Raises ``UsageError`` for any other argument count. Sets
        ``self.exitcode`` to 1 and returns when ``scrapy.cfg`` already
        exists in the target or ``self._is_valid_name`` rejects the name.
        """
        arg_count = len(args)
        if arg_count not in (1, 2):
            raise UsageError()

        # Project name, and the project directory (defaults to the name).
        project_name = args[0]
        project_dir = args[1] if arg_count == 2 else args[0]

        # Refuse to overwrite a project: check for scrapy.cfg in the target.
        if exists(join(project_dir, 'scrapy.cfg')):
            self.exitcode = 1
            print('Error: scrapy.cfg already exists in %s' %
                  abspath(project_dir))
            return

        # Validate the project name.
        if not self._is_valid_name(project_name):
            self.exitcode = 1
            return

        # Copy the contents of the templates directory into the project.
        destination = abspath(project_dir)
        self._copytree(self.templates_dir, destination)

        # Rename the placeholder "module" directory to the project name.
        move(join(project_dir, 'module'), join(project_dir, project_name))

        # Resolve each template path and render the file in place.
        for path_parts in TEMPLATES_TO_RENDER:
            template_path = string.Template(join(*path_parts))
            resolved = template_path.substitute(project_name=project_name)
            render_templatefile(
                join(project_dir, resolved),
                project_name=project_name,
                ProjectName=string_camelcase(project_name),
            )

        print("New Scrapy project %r, using template directory %r, created in:"
              % (project_name, self.templates_dir))
        print("    %s\n" % abspath(project_dir))
        print("You can start your first spider with:")
        print("    cd %s" % project_dir)
        print("    scrapy genspider example example.com")
示例#21
0
    def run(self, args, opts):
        """Create a Scrapy project directory from ``self.templates_dir``.

        Raises ``UsageError`` unless exactly one argument (the project
        name) is given. Sets ``self.exitcode`` to 1 and returns when
        ``self._is_valid_name`` rejects the name.
        """
        if len(args) != 1:
            raise UsageError()
        project_name = args[0]

        name_ok = self._is_valid_name(project_name)
        if not name_ok:
            self.exitcode = 1
            return

        # Copy the template tree, then rename its placeholder package.
        copytree(self.templates_dir, project_name, ignore=IGNORE)
        placeholder_pkg = join(project_name, 'module')
        move(placeholder_pkg, join(project_name, project_name))

        for parts in TEMPLATES_TO_RENDER:
            relative_tpl = string.Template(join(*parts)).substitute(
                project_name=project_name)
            render_templatefile(
                join(project_name, relative_tpl),
                project_name=project_name,
                ProjectName=string_camelcase(project_name),
            )

        summary = "New Scrapy project {0!r}, using template directory {1!r}, created in:".format(
            project_name, self.templates_dir)
        print(summary)
        print("    {0!s}\n".format(abspath(project_name)))
        print("You can start your first spider with:")
        print("    cd {0!s}".format(project_name))
        print("    scrapy genspider example example.com")
示例#22
0
    def run(self, args, opts):
        """Create a new Scrapy project directory named after the single argument.

        Raises ``UsageError`` unless exactly one argument is given. Sets
        ``self.exitcode`` to 1 and returns when ``self._is_valid_name``
        rejects the name.
        """
        if len(args) != 1:
            raise UsageError()
        project_name = args[0]

        if not self._is_valid_name(project_name):
            self.exitcode = 1
            return

        # The package skeleton goes to <name>/<name>; scrapy.cfg sits beside it.
        module_template = join(TEMPLATES_PATH, 'module')
        package_dir = join(project_name, project_name)
        copytree(module_template, package_dir, ignore=IGNORE)
        shutil.copy(join(TEMPLATES_PATH, 'scrapy.cfg'), project_name)

        for path_parts in TEMPLATES_TO_RENDER:
            template_path = string.Template(join(*path_parts))
            resolved = template_path.substitute(project_name=project_name)
            render_templatefile(
                join(project_name, resolved),
                project_name=project_name,
                ProjectName=string_camelcase(project_name),
            )

        print("New Scrapy project %r created in:" % project_name)
        print("    %s\n" % abspath(project_name))
        print("You can start your first spider with:")
        print("    cd %s" % project_name)
        print("    scrapy genspider example example.com")
示例#23
0
 def _genspider(self, module, name, domain, template_name, template_file):
     """Create ``<module>.py`` from ``template_file`` and render it in place.

     The destination is the directory of the ``NEWSPIDER_MODULE`` package
     when that setting is set, otherwise the current directory. Prints a
     confirmation that names the spider, the template and, if known, the
     module path.
     """
     # "my_spider" -> "MySpider"
     camel_module = ''.join(piece.capitalize() for piece in module.split('_'))
     tvars = dict(
         project_name=self.settings.get('BOT_NAME'),
         ProjectName=string_camelcase(self.settings.get('BOT_NAME')),
         module=module,
         name=name,
         domain=domain,
         classname='%sSpider' % camel_module,
     )

     # Fall back to the working directory when no spiders package is set.
     spiders_module = None
     spiders_dir = "."
     if self.settings.get('NEWSPIDER_MODULE'):
         spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
         spiders_dir = abspath(dirname(spiders_module.__file__))

     spider_file = join(spiders_dir, module) + ".py"
     shutil.copyfile(template_file, spider_file)
     render_templatefile(spider_file, **tvars)

     headline = "Created spider %r using template %r " % (name, template_name)
     if spiders_module:
         # No newline here: the module path printed next continues the line.
         print(headline, end='')
         print("in module:\n  %s.%s" % (spiders_module.__name__, module))
     else:
         print(headline, end='\n')
示例#24
0
 def _genspider(self, module, name, domain, template_name, template_file):
     """Generate the spider module, based on the given template.

     ``template_file`` is copied to ``<module>.py`` and rendered in place
     with the project, module, spider and domain names. The file goes to
     the directory of the ``NEWSPIDER_MODULE`` package when that setting is
     set, and to the current directory otherwise. A confirmation is printed
     that names the spider, the template and, if known, the module path.
     """
     # "my_spider" -> "MySpider"
     capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
     tvars = {
         'project_name': self.settings.get('BOT_NAME'),
         'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
         'module': module,
         'name': name,
         'domain': domain,
         'classname': f'{capitalized_module}Spider'
     }
     if self.settings.get('NEWSPIDER_MODULE'):
         spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
         spiders_dir = abspath(dirname(spiders_module.__file__))
     else:
         spiders_module = None
         spiders_dir = "."
     spider_file = f"{join(spiders_dir, module)}.py"
     shutil.copyfile(template_file, spider_file)
     render_templatefile(spider_file, **tvars)
     # With a spiders module the line is left open so that the module path
     # printed below continues it.
     print(f"Created spider {name!r} using template {template_name!r} ",
           end=('' if spiders_module else '\n'))
     if spiders_module:
         # The f prefix is required; without it the placeholders are printed
         # literally instead of the module path.
         print(f"in module:\n  {spiders_module.__name__}.{module}")