def __init__(self):
    """Load crawl settings and build the rule set before CrawlSpider setup runs."""
    settings = ReadSetting()  # reads the project's configuration values
    self.start_urls = settings.readurl()
    self.allowed_domains = settings.readalloweddomain()
    self.xpath = settings.readxpath()
    # The link matrix records the crawl graph, rooted at the start URLs.
    self.linkmatrix = LinkMatrix(settings.projectname())
    self.linkmatrix.setroot(self.start_urls)
    # Follow every extracted link; requests outside the allowed domains are
    # dropped by the spider middleware, and each resulting response is handed
    # to parse_start_url.  Rules must exist before super().__init__() so the
    # parent class can compile them.
    self.rules = [Rule(LinkExtractor(), follow=True, callback="parse_start_url")]
    super(XpathSpider, self).__init__()
def __init__(self):
    """Read the save-related settings and create the download directory.

    Binds ``self.getpath`` to the path-building strategy matching the
    configured naming scheme (1, 2 or 3); for any other value the
    attribute is left unset, as in the original code.
    """
    rs = ReadSetting()  # reads the saving options from the settings file
    self.savename = rs.savingname()
    self.location = rs.savinglocation()
    # NOTE: attribute name ("saveingformat") is misspelled but kept as-is
    # for compatibility with existing callers.
    self.saveingformat = rs.savingformat()
    self.projectname = rs.projectname()
    # Dispatch table instead of an if/elif chain: pick the getpath strategy
    # for the configured naming scheme.
    strategies = {1: self.getpath_1, 2: self.getpath_2, 3: self.getpath_3}
    if self.savename in strategies:
        self.getpath = strategies[self.savename]
    # Create the folder the downloads are saved into.  An already-existing
    # directory is fine; any other OSError (permissions, bad path, ...) is a
    # real failure and is re-raised — the original code silently swallowed
    # every OSError, hiding such errors.
    try:
        os.mkdir(self.location)
    except OSError as e:
        if e.errno != 17:  # 17 == errno.EEXIST
            raise
def __init__(self):
    """Load crawl settings and precompile the allow/deny URL match patterns."""
    settings = ReadSetting()  # reads the project's configuration values
    self.start_urls = settings.readurl()
    self.allowed_domains = settings.readalloweddomain()
    self.linkmatrix = LinkMatrix(settings.projectname())
    self.linkmatrix.setroot(self.start_urls)
    # Turn the literal allow/deny strings into one alternation regex each.
    # NOTE(review): an empty list produces the pattern '()' which matches
    # every URL — confirm at the call site that an empty allow/deny list is
    # intended to mean "match everything".
    self.allow, self.deny = settings.readurlmatch()
    self.regex_allow = re.compile('({0})'.format('|'.join(re.escape(item) for item in self.allow)))
    self.regex_deny = re.compile('({0})'.format('|'.join(re.escape(item) for item in self.deny)))
    # Follow every link; requests outside the allowed domains are filtered
    # out by the spider middleware and every response goes to parse_match.
    # Rules must exist before super().__init__() compiles them.
    self.rules = [Rule(LinkExtractor(), follow=True, callback="parse_match")]
    super(MatchSpider, self).__init__()