def run(self, args, opts):
    """Create a new Scrapy project directory named after ``args[0]``.

    The name must match the identifier-like pattern checked below and must
    not already exist as a path; otherwise an error is printed and the
    process exits with status 1. On success the ``module`` template tree
    and ``scrapy.cfg`` are copied from ``TEMPLATES_PATH``, every entry of
    ``TEMPLATES_TO_RENDER`` is rendered, and a usage hint is printed.

    Raises:
        UsageError: if ``args`` does not hold exactly one item.
    """
    if len(args) != 1:
        raise UsageError()

    (name,) = args

    # Guard clauses: both failure modes terminate the process.
    if re.search(r'^[_a-zA-Z]\w*$', name) is None:
        print('Error: Project names must begin with a letter and contain only\n'
              'letters, numbers and underscores')
        sys.exit(1)
    if exists(name):
        print("Error: directory %r already exists" % name)
        sys.exit(1)

    # Lay down the package skeleton, then the top-level config file.
    copytree(join(TEMPLATES_PATH, 'module'), join(name, name), ignore=IGNORE)
    shutil.copy(join(TEMPLATES_PATH, 'scrapy.cfg'), name)

    for parts in TEMPLATES_TO_RENDER:
        # Path components may themselves contain ${project_name}.
        relative = string.Template(join(*parts)).substitute(project_name=name)
        render_templatefile(
            join(name, relative),
            project_name=name,
            ProjectName=string_camelcase(name),
        )

    print("New Scrapy project %r created in:" % name)
    print(" %s\n" % abspath(name))
    print("You can start your first spider with:")
    print(" cd %s" % name)
    print(" scrapy genspider example example.com")
def run(self, args, opts):
    """Create a new Scrapy project directory named after ``args[0]``.

    If ``self._is_valid_name`` rejects the name, ``self.exitcode`` is set
    to 1 and nothing is created. Otherwise the ``module`` template tree and
    ``scrapy.cfg`` are copied from ``TEMPLATES_PATH``, every entry of
    ``TEMPLATES_TO_RENDER`` is rendered, and a usage hint is printed.

    Raises:
        UsageError: if ``args`` does not hold exactly one item.
    """
    if len(args) != 1:
        raise UsageError()

    (name,) = args
    if not self._is_valid_name(name):
        self.exitcode = 1
        return

    package_dir = join(name, name)
    copytree(join(TEMPLATES_PATH, 'module'), package_dir, ignore=IGNORE)
    shutil.copy(join(TEMPLATES_PATH, 'scrapy.cfg'), name)

    for parts in TEMPLATES_TO_RENDER:
        # Path components may themselves contain ${project_name}.
        relative = string.Template(join(*parts)).substitute(project_name=name)
        render_templatefile(
            join(name, relative),
            project_name=name,
            ProjectName=string_camelcase(name),
        )

    print("New Scrapy project %r created in:" % name)
    print(" %s\n" % abspath(name))
    print("You can start your first spider with:")
    print(" cd %s" % name)
    print(" scrapy genspider example example.com")
def run(self, args, opts):
    """Create a Scrapy project skeleton from ``self.templates_dir``.

    ``args`` holds the project name and, optionally, the target directory
    (which otherwise equals the project name). If the target already
    contains ``scrapy.cfg`` or ``self._is_valid_name`` rejects the name,
    ``self.exitcode`` is set to 1 and nothing is created.

    Raises:
        UsageError: if ``args`` does not hold one or two items.
    """
    if not 1 <= len(args) <= 2:
        raise UsageError()

    project_name = args[0]
    project_dir = args[1] if len(args) == 2 else args[0]

    if exists(join(project_dir, 'scrapy.cfg')):
        self.exitcode = 1
        print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
        return

    if not self._is_valid_name(project_name):
        self.exitcode = 1
        return

    # Copy the whole template tree, then give the package its real name.
    self._copytree(self.templates_dir, abspath(project_dir))
    move(join(project_dir, 'module'), join(project_dir, project_name))

    for parts in TEMPLATES_TO_RENDER:
        relative = string.Template(join(*parts)).substitute(
            project_name=project_name)
        render_templatefile(
            join(project_dir, relative),
            project_name=project_name,
            ProjectName=string_camelcase(project_name),
        )

    summary = "New Scrapy project %r, using template directory %r, created in:"
    print(summary % (project_name, self.templates_dir))
    print(" %s\n" % abspath(project_dir))
    print("You can start your first spider with:")
    print(" cd %s" % project_dir)
    print(" scrapy genspider example example.com")
def _genspider(self, module, name, domain, template_name, template_file):
    """Write a new spider file rendered from ``template_file``.

    The template is copied to ``<spiders_dir>/<module>.py`` and rendered in
    place. ``spiders_dir`` is the directory of the ``NEWSPIDER_MODULE``
    package when that setting is truthy, otherwise the current directory.
    A one- or two-line confirmation is printed.
    """
    class_prefix = ''.join(part.capitalize() for part in module.split('_'))
    tvars = dict(
        project_name=self.settings.get('BOT_NAME'),
        ProjectName=string_camelcase(self.settings.get('BOT_NAME')),
        module=module,
        name=name,
        domain=domain,
        classname='%sSpider' % class_prefix,
    )

    # Default to the working directory unless a spiders package is set.
    spiders_module = None
    spiders_dir = "."
    if self.settings.get('NEWSPIDER_MODULE'):
        spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
        spiders_dir = abspath(dirname(spiders_module.__file__))

    spider_file = join(spiders_dir, module) + ".py"
    shutil.copyfile(template_file, spider_file)
    render_templatefile(spider_file, **tvars)

    summary = "Created spider %r using template %r " % (name, template_name)
    if spiders_module:
        # Keep the cursor on the same line so the module path follows it.
        print(summary, end='')
        print("in module:\n %s.%s" % (spiders_module.__name__, module))
    else:
        print(summary, end='\n')
def _genspider(self, module, name, domain, template_name, template_file):
    """Write a new spider file rendered from ``template_file``.

    The template is copied to ``<spiders_dir>/<module>.py`` and rendered in
    place. ``spiders_dir`` is the directory of the ``NEWSPIDER_MODULE``
    package when that setting is truthy, otherwise the current directory.
    A one- or two-line confirmation is printed.
    """
    class_prefix = "".join(map(str.capitalize, module.split("_")))
    tvars = dict(
        project_name=self.settings.get("BOT_NAME"),
        ProjectName=string_camelcase(self.settings.get("BOT_NAME")),
        module=module,
        name=name,
        domain=domain,
        classname=f"{class_prefix}Spider",
    )

    if not self.settings.get("NEWSPIDER_MODULE"):
        # No spiders package configured: write into the working directory.
        spiders_module, spiders_dir = None, "."
    else:
        spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
        spiders_dir = abspath(dirname(spiders_module.__file__))

    spider_file = join(spiders_dir, module) + ".py"
    shutil.copyfile(template_file, spider_file)
    render_templatefile(spider_file, **tvars)

    line_end = "" if spiders_module else "\n"
    print(f"Created spider {name!r} using template {template_name!r} ", end=line_end)
    if spiders_module:
        print(f"in module:\n {spiders_module.__name__}.{module}")
def run(self, project, opt=None):
    """Create a structure-spider project in a directory named ``project``.

    ``project`` serves as both the project name and the target directory.
    If the target already contains ``scrapy.cfg`` or
    ``self._is_valid_name`` rejects the name, ``self.exitcode`` is set to 1
    and nothing is created. Otherwise ``self.templates_dir`` is copied,
    ``scrapy.cfg`` and the settings template are rendered, and a usage hint
    is printed.
    """
    if exists(join(project, 'scrapy.cfg')):
        self.exitcode = 1
        print('Error: scrapy.cfg already exists in %s' % abspath(project))
        return

    if not self._is_valid_name(project):
        self.exitcode = 1
        return

    # Copy the whole template tree, then give the package its real name.
    self._copytree(self.templates_dir, abspath(project))
    move(join(project, 'module'), join(project, project))

    to_render = (
        ('scrapy.cfg',),
        ('${project_name}', 'settings.py.tmpl'),
    )
    for parts in to_render:
        relative = string.Template(join(*parts)).substitute(project_name=project)
        render_templatefile(
            join(project, relative),
            project_name=project,
            ProjectName=string_camelcase(project),
        )

    summary = "New structure-spider project %r, using template directory %r, created in:"
    print(summary % (project, self.templates_dir))
    print(" %s\n" % abspath(project))
    print("You can start the spider with:")
    print(" cd %s" % project)
    print(" custom-redis-server -ll INFO -lf &")
    print(" scrapy crawl douban")
def run(self, project, opt=None):
    """Create a web-walker project in a directory named ``project``.

    ``project`` serves as both the project name and the target directory.
    If the target already contains ``scrapy.cfg`` or
    ``self._is_valid_name`` rejects the name, ``self.exitcode`` is set to 1
    and nothing is created. Otherwise ``self.templates_dir`` is copied,
    every entry of ``TEMPLATES_TO_RENDER`` is rendered, and a usage hint is
    printed.
    """
    config_path = join(project, 'scrapy.cfg')
    if exists(config_path):
        self.exitcode = 1
        print('Error: scrapy.cfg already exists in %s' % abspath(project))
        return

    if not self._is_valid_name(project):
        self.exitcode = 1
        return

    # Copy the whole template tree, then give the package its real name.
    self._copytree(self.templates_dir, abspath(project))
    move(join(project, 'module'), join(project, project))

    for parts in TEMPLATES_TO_RENDER:
        relative = string.Template(join(*parts)).substitute(project_name=project)
        render_templatefile(
            join(project, relative),
            project_name=project,
            ProjectName=string_camelcase(project),
        )

    summary = "New web-walker project %r, using template directory %r, created in:"
    print(summary % (project, self.templates_dir))
    print(" %s\n" % abspath(project))
    print("You can start the demo spider with:")
    print(" custom-redis-server --host 127.0.0.1 -p 6379")
    print(" cd %s" % project)
    print(" scrapy crawl bluefly")
def run(self, args, opts):
    """Create a Scrapy project skeleton from ``self.templates_dir``.

    ``args`` holds the project name and, optionally, the target directory
    (which otherwise equals the project name). If the target already
    contains ``scrapy.cfg`` or ``self._is_valid_name`` rejects the name,
    ``self.exitcode`` is set to 1 and nothing is created.

    Raises:
        UsageError: if ``args`` does not hold one or two items.
    """
    arg_count = len(args)
    if arg_count not in (1, 2):
        raise UsageError()

    project_name = args[0]
    project_dir = args[-1]  # same item as the name when only one is given

    if exists(join(project_dir, 'scrapy.cfg')):
        self.exitcode = 1
        print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
        return

    if not self._is_valid_name(project_name):
        self.exitcode = 1
        return

    # Copy the whole template tree, then give the package its real name.
    self._copytree(self.templates_dir, abspath(project_dir))
    move(join(project_dir, 'module'), join(project_dir, project_name))

    for parts in TEMPLATES_TO_RENDER:
        relative = string.Template(join(*parts)).substitute(
            project_name=project_name)
        render_templatefile(
            join(project_dir, relative),
            project_name=project_name,
            ProjectName=string_camelcase(project_name),
        )

    summary = ("New Scrapy project '%s', using template directory '%s', "
               "created in:")
    print(summary % (project_name, self.templates_dir))
    print(" %s\n" % abspath(project_dir))
    print("You can start your first spider with:")
    print(" cd %s" % project_dir)
    print(" scrapy genspider example example.com")
def start_scrapy_project(project_name):
    """Render the default scrapy files for a portia project.

    Every file returned by ``find_files(project_name)`` has its
    ``$project_name`` / ``$ProjectName`` placeholders substituted. A
    trailing ``.tmpl`` is removed from the path.

    Returns:
        dict mapping each output path to its rendered contents.
    """
    suffix = '.tmpl'
    rendered = {}
    for path, contents in find_files(project_name).items():
        text = string.Template(contents).substitute(
            project_name=project_name,
            ProjectName=string_camelcase(project_name),
        )
        target = path[:-len(suffix)] if path.endswith(suffix) else path
        rendered[target] = text
    return rendered
def run(self, args, opts):
    """Scaffold a new classifier and interactively write its settings.cfg.

    ``args`` must hold exactly one item, the classifier name, made up of
    lower-case ASCII letters and underscores only. The ``classifier``
    template tree is copied to ``CLASSIFIERS_PATH/<name>``, every entry of
    ``TEMPLATES_TO_RENDER`` is rendered, and the user is prompted for the
    class labels and for whether data of each class should be collected.
    The answers are stored in ``data/<name>/settings.cfg``.

    An invalid name or an already existing classifier directory prints an
    error and exits the process with status 1.

    Raises:
        UsageError: if ``args`` does not hold exactly one item.
        ValueError: if a yes/no prompt is answered with a non-integer.
    """
    if len(args) != 1:
        raise UsageError()

    classifier_name = args[0]
    # '+' rather than '*': an empty name must be rejected as well.
    if not re.search(r'^[_a-z]+$', classifier_name):
        print('Error: Classifier names must be entirely lower case')
        sys.exit(1)

    # Check the directory copytree() is about to create. The previous check
    # built a path starting with os.sep and so looked at the filesystem root.
    classifier_dir = join(CLASSIFIERS_PATH, classifier_name)
    if exists(classifier_dir):
        print("Error: directory %r already exists" % classifier_name)
        sys.exit(1)

    # If this is the first classifier
    if not os.path.exists("data"):
        os.makedirs("data")
        # Create an empty data/__init__.py; the with-block closes it.
        with open("data/__init__.py", "wb"):
            pass
    if not os.path.exists("to_upload"):
        os.makedirs("to_upload")

    # Make classifier file
    moduletpl = join(TEMPLATES_PATH, 'classifier')
    copytree(moduletpl, classifier_dir, ignore=IGNORE)
    for paths in TEMPLATES_TO_RENDER:
        path = join(*paths)
        tplfile = join(
            CLASSIFIERS_PATH,
            string.Template(path).substitute(classifier_name=classifier_name))
        render_templatefile(
            tplfile,
            classifier_name=classifier_name,
            ClassifierName=string_camelcase(classifier_name))

    # Make settings.cfg file
    config = ConfigParser.RawConfigParser()
    config.add_section("Classifier")
    classifications = raw_input(
        "Please input classifications separated by commas\n").split(",")
    config.set("Classifier", "classes",
               ",".join(sorted(c.strip() for c in classifications)))
    for class_type in config.get("Classifier", "classes").split(","):
        keep = int(raw_input(
            "Collect data classified as {0}?\n1. Yes\n 2. No".format(class_type)))
        # Only the answer "1" means yes; any other integer means no.
        config.set("Classifier", class_type, keep == 1)
    with open("data/{0}/settings.cfg".format(classifier_name), "wb") as configfile:
        config.write(configfile)
def start_scrapy_project(project_name):
    """Render the default scrapy files for a portia project.

    Every file returned by ``find_files(project_name)`` has its
    ``$project_name`` / ``$ProjectName`` placeholders substituted. A
    trailing ``.tmpl`` is removed from the path.

    Returns:
        dict mapping each output path to its rendered contents.
    """
    tmpl_ext = '.tmpl'
    out_files = {}
    for source_path, raw in find_files(project_name).items():
        # Passing a mapping is equivalent to passing keyword arguments.
        rendered = string.Template(raw).substitute({
            'project_name': project_name,
            'ProjectName': string_camelcase(project_name),
        })
        if source_path.endswith(tmpl_ext):
            out_files[source_path[:-len(tmpl_ext)]] = rendered
        else:
            out_files[source_path] = rendered
    return out_files
def _genspider(self, module, name, domain, template_name, template_file):
    """Write a new spider file rendered from ``template_file``.

    The template is copied to ``<module>.py`` inside the directory of the
    package named by the ``NEWSPIDER_MODULE`` setting and rendered in
    place. A two-line confirmation is then printed.
    """
    class_prefix = "".join(map(str.capitalize, module.split("_")))
    tvars = dict(
        project_name=self.settings.get("BOT_NAME"),
        ProjectName=string_camelcase(self.settings.get("BOT_NAME")),
        module=module,
        name=name,
        domain=domain,
        classname="%sSpider" % class_prefix,
    )

    spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
    package_dir = abspath(dirname(spiders_module.__file__))
    spider_file = join(package_dir, module) + ".py"

    shutil.copyfile(template_file, spider_file)
    render_templatefile(spider_file, **tvars)

    print("Created spider %r using template %r in module:" % (name, template_name))
    print(" %s.%s" % (spiders_module.__name__, module))
def start_scrapy_project(project_name):
    """Render the default scrapy files for a portia project.

    Under ``PY2`` the name is first passed through ``encode``. Every file
    returned by ``find_files`` has its ``$project_name`` / ``$ProjectName``
    placeholders substituted. A trailing ``.tmpl`` is removed from the
    path, and any path ending in ``scrapy.cfg`` is stored as plain
    ``scrapy.cfg``. A ``setup.py`` entry built by ``SETUP`` is added last.

    Returns:
        dict mapping each output path to its rendered contents.
    """
    if PY2:
        project_name = encode(project_name)

    tmpl_ext = '.tmpl'
    rendered = {}
    for source_path, raw in find_files(project_name).items():
        text = string.Template(raw).substitute(
            project_name=project_name,
            ProjectName=string_camelcase(project_name),
        )
        target = source_path
        if target.endswith(tmpl_ext):
            target = target[:-len(tmpl_ext)]
        # Checked after stripping, so "scrapy.cfg.tmpl" collapses as well.
        if target.endswith('scrapy.cfg'):
            target = 'scrapy.cfg'
        rendered[target] = text

    rendered['setup.py'] = SETUP(project_name)
    return rendered
def _genspider(self, module, name, domain, template_name, template_file):
    """Generate the spider module, based on the given template.

    The template is copied to ``<module>.py`` inside the directory of the
    package named by the ``NEWSPIDER_MODULE`` setting (read from the
    module-level ``settings`` object) and rendered in place. A two-line
    confirmation is then printed.
    """
    tvars = {
        'project_name': settings.get('BOT_NAME'),
        'ProjectName': string_camelcase(settings.get('BOT_NAME')),
        'module': module,
        'name': name,
        'domain': domain,
        'classname': '%sSpider' % ''.join(
            [s.capitalize() for s in module.split('_')]),
    }

    # A non-empty fromlist makes __import__ return the named submodule
    # itself rather than the top-level package.
    spiders_module = __import__(settings['NEWSPIDER_MODULE'], {}, {}, [''])
    spiders_dir = abspath(dirname(spiders_module.__file__))
    spider_file = "%s.py" % join(spiders_dir, module)

    shutil.copyfile(template_file, spider_file)
    render_templatefile(spider_file, **tvars)

    # Single-argument print(...) is valid as both a Python 2 statement and a
    # Python 3 call; the bare print statement was a SyntaxError on Python 3.
    print("Created spider %r using template %r in module:" % (name, template_name))
    print(" %s.%s" % (spiders_module.__name__, module))
def _genspider(self, module, name, domain, template_name, template_file):
    """Generate the spider module, based on the given template.

    The template is copied to ``<module>.py`` inside the directory of the
    package named by the ``NEWSPIDER_MODULE`` setting and rendered in
    place. A two-line confirmation is then printed.
    """
    tvars = {
        'project_name': self.settings.get('BOT_NAME'),
        'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
        'module': module,
        'name': name,
        'domain': domain,
        'classname': '%sSpider' % ''.join(
            [s.capitalize() for s in module.split('_')]),
    }

    # A non-empty fromlist makes __import__ return the named submodule
    # itself rather than the top-level package.
    spiders_module = __import__(self.settings['NEWSPIDER_MODULE'], {}, {}, [''])
    spiders_dir = abspath(dirname(spiders_module.__file__))
    spider_file = "%s.py" % join(spiders_dir, module)

    shutil.copyfile(template_file, spider_file)
    render_templatefile(spider_file, **tvars)

    # Single-argument print(...) is valid as both a Python 2 statement and a
    # Python 3 call; the bare print statement was a SyntaxError on Python 3.
    print("Created spider %r using template %r in module:" % (name, template_name))
    print(" %s.%s" % (spiders_module.__name__, module))
def run(self, args, opts):
    """Create a new project directory named after ``args[0]``.

    The name must match the identifier-like pattern checked below and must
    not already exist as a path; otherwise an error is printed and the
    process exits with status 1. On success the ``module`` template tree
    and ``scrapy.cfg`` are copied from ``TEMPLATES_PATH`` and every entry
    of ``TEMPLATES_TO_RENDER`` is rendered.

    Raises:
        UsageError: if ``args`` does not hold exactly one item.
    """
    if len(args) != 1:
        raise UsageError()

    project_name = args[0]
    # Single-argument print(...) is valid as both a Python 2 statement and a
    # Python 3 call; the bare print statement was a SyntaxError on Python 3.
    if not re.search(r'^[_a-zA-Z]\w*$', project_name):
        print('Error: Project names must begin with a letter and contain only\n'
              'letters, numbers and underscores')
        sys.exit(1)
    elif exists(project_name):
        print("Error: directory %r already exists" % project_name)
        sys.exit(1)

    moduletpl = join(TEMPLATES_PATH, 'module')
    copytree(moduletpl, join(project_name, project_name), ignore=IGNORE)
    shutil.copy(join(TEMPLATES_PATH, 'scrapy.cfg'), project_name)

    for paths in TEMPLATES_TO_RENDER:
        # Path components may themselves contain ${project_name}.
        path = join(*paths)
        tplfile = join(
            project_name,
            string.Template(path).substitute(project_name=project_name))
        render_templatefile(
            tplfile,
            project_name=project_name,
            ProjectName=string_camelcase(project_name))
def run(self, args, opts):
    """Scaffold a new classifier and interactively write its settings.cfg.

    ``args`` must hold exactly one item, the classifier name, made up of
    lower-case ASCII letters and underscores only. The ``classifier``
    template tree is copied to ``CLASSIFIERS_PATH/<name>``, every entry of
    ``TEMPLATES_TO_RENDER`` is rendered, and the user is prompted for the
    class labels and for whether data of each class should be collected.
    The answers are stored in ``data/<name>/settings.cfg``.

    An invalid name or an already existing classifier directory prints an
    error and exits the process with status 1.

    Raises:
        UsageError: if ``args`` does not hold exactly one item.
        ValueError: if a yes/no prompt is answered with a non-integer.
    """
    if len(args) != 1:
        raise UsageError()

    classifier_name = args[0]
    # '+' rather than '*': an empty name must be rejected as well.
    if not re.search(r'^[_a-z]+$', classifier_name):
        print('Error: Classifier names must be entirely lower case')
        sys.exit(1)

    # Check the directory copytree() is about to create. The previous check
    # built a path starting with os.sep and so looked at the filesystem root.
    target_dir = join(CLASSIFIERS_PATH, classifier_name)
    if exists(target_dir):
        print("Error: directory %r already exists" % classifier_name)
        sys.exit(1)

    # If this is the first classifier
    if not os.path.exists("data"):
        os.makedirs("data")
        # Create an empty data/__init__.py; the with-block closes it.
        with open("data/__init__.py", "wb"):
            pass
    if not os.path.exists("to_upload"):
        os.makedirs("to_upload")

    # Make classifier file
    copytree(join(TEMPLATES_PATH, 'classifier'), target_dir, ignore=IGNORE)
    for paths in TEMPLATES_TO_RENDER:
        path = join(*paths)
        tplfile = join(
            CLASSIFIERS_PATH,
            string.Template(path).substitute(classifier_name=classifier_name))
        render_templatefile(
            tplfile,
            classifier_name=classifier_name,
            ClassifierName=string_camelcase(classifier_name))

    # Make settings.cfg file
    config = ConfigParser.RawConfigParser()
    config.add_section("Classifier")
    answer = raw_input("Please input classifications separated by commas\n")
    classifications = answer.split(",")
    config.set("Classifier", "classes",
               ",".join(sorted(c.strip() for c in classifications)))
    for class_type in config.get("Classifier", "classes").split(","):
        keep = int(raw_input(
            "Collect data classified as {0}?\n1. Yes\n 2. No".format(class_type)))
        # Only the answer "1" means yes; any other integer means no.
        config.set("Classifier", class_type, keep == 1)
    with open("data/{0}/settings.cfg".format(classifier_name), "wb") as configfile:
        config.write(configfile)
def run(self, args, opts):
    """Create a Scrapy project skeleton from ``self.templates_dir``.

    ``args`` holds the project name and, optionally, the target directory
    (which otherwise equals the project name). If the target already
    contains ``scrapy.cfg`` or ``self._is_valid_name`` rejects the name,
    ``self.exitcode`` is set to 1 and nothing is created.

    Raises:
        UsageError: if ``args`` does not hold one or two items.
    """
    if not 1 <= len(args) <= 2:
        raise UsageError()

    # Project name, and the directory the project is created in.
    project_name = args[0]
    project_dir = args[1] if len(args) == 2 else project_name

    # Refuse to overwrite a project: does scrapy.cfg already exist there?
    if exists(join(project_dir, 'scrapy.cfg')):
        self.exitcode = 1
        print('Error: scrapy.cfg already exists in %s' % abspath(project_dir))
        return

    # Validate the project name.
    if not self._is_valid_name(project_name):
        self.exitcode = 1
        return

    # Copy the contents of the templates directory into the project.
    self._copytree(self.templates_dir, abspath(project_dir))

    # Rename the placeholder "module" directory to the project name.
    move(join(project_dir, 'module'), join(project_dir, project_name))

    # Fill in each template file listed for rendering.
    for parts in TEMPLATES_TO_RENDER:
        relative = string.Template(join(*parts)).substitute(
            project_name=project_name)
        render_templatefile(
            join(project_dir, relative),
            project_name=project_name,
            ProjectName=string_camelcase(project_name),
        )

    summary = "New Scrapy project %r, using template directory %r, created in:"
    print(summary % (project_name, self.templates_dir))
    print(" %s\n" % abspath(project_dir))
    print("You can start your first spider with:")
    print(" cd %s" % project_dir)
    print(" scrapy genspider example example.com")
def run(self, args, opts):
    """Create a Scrapy project skeleton from ``self.templates_dir``.

    If ``self._is_valid_name`` rejects ``args[0]``, ``self.exitcode`` is
    set to 1 and nothing is created. Otherwise the template directory is
    copied to a directory named after the project, its ``module``
    subdirectory is renamed to the project name, every entry of
    ``TEMPLATES_TO_RENDER`` is rendered, and a usage hint is printed.

    Raises:
        UsageError: if ``args`` does not hold exactly one item.
    """
    if len(args) != 1:
        raise UsageError()

    (name,) = args
    if not self._is_valid_name(name):
        self.exitcode = 1
        return

    # Copy the whole template tree, then give the package its real name.
    copytree(self.templates_dir, name, ignore=IGNORE)
    move(join(name, 'module'), join(name, name))

    for parts in TEMPLATES_TO_RENDER:
        relative = string.Template(join(*parts)).substitute(project_name=name)
        render_templatefile(
            join(name, relative),
            project_name=name,
            ProjectName=string_camelcase(name),
        )

    summary = "New Scrapy project {0!r}, using template directory {1!r}, created in:"
    print(summary.format(name, self.templates_dir))
    print(" {0!s}\n".format(abspath(name)))
    print("You can start your first spider with:")
    print(" cd {0!s}".format(name))
    print(" scrapy genspider example example.com")
def run(self, args, opts):
    """Create a new Scrapy project directory named after ``args[0]``.

    If ``self._is_valid_name`` rejects the name, ``self.exitcode`` is set
    to 1 and nothing is created. Otherwise the ``module`` template tree and
    ``scrapy.cfg`` are copied from ``TEMPLATES_PATH``, every entry of
    ``TEMPLATES_TO_RENDER`` is rendered, and a usage hint is printed.

    Raises:
        UsageError: if ``args`` does not hold exactly one item.
    """
    if len(args) != 1:
        raise UsageError()

    project = args[0]
    name_ok = self._is_valid_name(project)
    if not name_ok:
        self.exitcode = 1
        return

    # Package skeleton first, then the top-level config file.
    module_template = join(TEMPLATES_PATH, 'module')
    copytree(module_template, join(project, project), ignore=IGNORE)
    config_template = join(TEMPLATES_PATH, 'scrapy.cfg')
    shutil.copy(config_template, project)

    for components in TEMPLATES_TO_RENDER:
        # Path components may themselves contain ${project_name}.
        template_path = string.Template(join(*components))
        target = join(project, template_path.substitute(project_name=project))
        render_templatefile(
            target,
            project_name=project,
            ProjectName=string_camelcase(project),
        )

    print("New Scrapy project %r created in:" % project)
    print(" %s\n" % abspath(project))
    print("You can start your first spider with:")
    print(" cd %s" % project)
    print(" scrapy genspider example example.com")
def _genspider(self, module, name, domain, template_name, template_file):
    """Write a new spider file rendered from ``template_file``.

    The template is copied to ``<spiders_dir>/<module>.py`` and rendered in
    place. ``spiders_dir`` is the directory of the ``NEWSPIDER_MODULE``
    package when that setting is truthy, otherwise the current directory.
    A one- or two-line confirmation is printed.
    """
    words = module.split('_')
    tvars = {
        'project_name': self.settings.get('BOT_NAME'),
        'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
        'module': module,
        'name': name,
        'domain': domain,
        'classname': '%sSpider' % ''.join(map(str.capitalize, words)),
    }

    if not self.settings.get('NEWSPIDER_MODULE'):
        # No spiders package configured: write into the working directory.
        spiders_module, spiders_dir = None, "."
    else:
        spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
        spiders_dir = abspath(dirname(spiders_module.__file__))

    spider_file = join(spiders_dir, module) + ".py"
    shutil.copyfile(template_file, spider_file)
    render_templatefile(spider_file, **tvars)

    # Suppress the newline only when the module path is about to follow.
    line_end = '' if spiders_module else '\n'
    print("Created spider %r using template %r " % (name, template_name),
          end=line_end)
    if spiders_module:
        print("in module:\n %s.%s" % (spiders_module.__name__, module))
def _genspider(self, module, name, domain, template_name, template_file):
    """Generate the spider module, based on the given template.

    The template is copied to ``<spiders_dir>/<module>.py`` and rendered in
    place. ``spiders_dir`` is the directory of the ``NEWSPIDER_MODULE``
    package when that setting is truthy, otherwise the current directory.
    A one- or two-line confirmation is printed.
    """
    capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
    tvars = {
        'project_name': self.settings.get('BOT_NAME'),
        'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
        'module': module,
        'name': name,
        'domain': domain,
        'classname': f'{capitalized_module}Spider',
    }

    if self.settings.get('NEWSPIDER_MODULE'):
        spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
        spiders_dir = abspath(dirname(spiders_module.__file__))
    else:
        # No spiders package configured: write into the working directory.
        spiders_module = None
        spiders_dir = "."

    spider_file = f"{join(spiders_dir, module)}.py"
    shutil.copyfile(template_file, spider_file)
    render_templatefile(spider_file, **tvars)

    # Suppress the newline only when the module path is about to follow.
    print(f"Created spider {name!r} using template {template_name!r} ",
          end=('' if spiders_module else '\n'))
    if spiders_module:
        # The f prefix was missing here, so the braces were printed
        # literally instead of the module path.
        print(f"in module:\n {spiders_module.__name__}.{module}")
def _genspider(self, jiraid, module, name, domain, requirement_path, url,
               template_name, template_file, opts):
    """Generate a spider (and optionally its items file) for a Jira ticket.

    When ``NEWSPIDER_MODULE`` is set, a new ``<jiraid>`` directory is
    created inside that package; if it already exists a message is printed
    and nothing else happens. Otherwise files go to the current directory.
    The spider is rendered from ``template_file`` to ``<name>.py``. If
    ``CUSTOM_TEMPLATES_DIR`` is set, ``items.py`` is rendered from it too.
    An ``__init__.py`` is ensured and the requirement document is copied
    alongside.

    Any exception raised while generating is printed rather than
    propagated, and a ``<jiraid>`` directory created by this call is
    removed again.
    """
    headers = self.__headers(requirement_path)
    # NOTE: val_header is the same object as headers['top_header'] (assuming
    # headers is a plain dict), so the keys popped here are also missing
    # from the pretty-printed 'top_header' below.
    val_header = headers.get('top_header')
    for k in ['sourceName', 'url', 'ingestion_timestamp']:
        val_header.pop(k)

    tvars = {
        'project_name': self.settings.get('BOT_NAME'),
        'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
        'module': module,
        'jiraid': jiraid,
        'name': name,
        'start_url': url,
        'username': Utils.getUsername(),
        'datetime': Utils.getCurrentDateTimeStr(),
        'domain': domain,
        'val_header': val_header,
        # Passed as a string of source code, not evaluated here.
        'ingestion_timestamp': 'Utils.getingestion_timestamp()',
        'default_val': {'sourceName': name, 'url': url},
        'null_header': None,
        'feed_expo': None,
        'top_header': None,
        'classname': '%sSpider' % ''.join(
            s.capitalize() for s in name.split('_')),
    }

    # Only a directory created by this call may be removed on failure.
    # The old handler read `spiders_dir`, which is unbound if import_module
    # raises and is "." (the working directory) when no module is set.
    created_dir = None
    try:
        if self.settings.get('NEWSPIDER_MODULE'):
            spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
            spiders_dir = os.path.join(
                abspath(dirname(spiders_module.__file__)), jiraid)
            if os.path.exists(spiders_dir):
                print("Spider %r jiraID already exists in module:" % jiraid)
                return
            os.mkdir(spiders_dir)
            created_dir = spiders_dir
        else:
            spiders_module = None
            spiders_dir = "."

        if opts.custom:
            import pprint
            pp = pprint.PrettyPrinter(indent=25, width=250)
            tvars['null_header'] = headers.get('null_header')
            tvars['feed_expo'] = pp.pformat(headers.get('feed_expo'))
            tvars['top_header'] = pp.pformat(headers.get('top_header'))

        spider_file = "%s.py" % join(spiders_dir, name)
        shutil.copyfile(template_file, spider_file)
        render_templatefile(spider_file, **tvars)

        if self.settings['CUSTOM_TEMPLATES_DIR']:
            _template_file = join(
                self.settings['CUSTOM_TEMPLATES_DIR'], 'items.py.tmpl')
            item_file = "%s.py" % join(spiders_dir, 'items')
            shutil.copyfile(_template_file, item_file)
            render_templatefile(item_file, **tvars)

        # Append mode creates __init__.py if missing, else leaves it as is.
        init_file = "%s.py" % join(spiders_dir, '__init__')
        with open(init_file, 'a'):
            pass

        # copy the requirement document in spider folder
        shutil.copyfile(
            requirement_path,
            join(spiders_dir, os.path.basename(requirement_path)))

        print("Created spider %r using template %r " % (name, template_name),
              end=('' if spiders_module else '\n'))
        if spiders_module:
            print("in module:\n %s.%s" % (spiders_module.__name__, module))
    except Exception as e:
        # Best-effort cleanup: roll back the directory made above, if any.
        if created_dir:
            FileUtils.deletePath(created_dir)
        print(e)