def parse_xpath(self, response, xpath):
    """Build one AppItem per URL matched by ``xpath`` in ``response``.

    Each string extracted by the XPath query is resolved against
    ``response.url`` with ``urljoin``, logged at INFO level, and stored
    under the item's ``'url'`` key.

    Args:
        response: Response wrapped in a ``Selector``; its ``url`` attribute
            is the base used to resolve the extracted links.
        xpath: XPath expression whose extracted strings are treated as URLs.

    Returns:
        A list of ``AppItem`` objects, in the order the matches were
        extracted.
    """
    selector = Selector(response)
    items = []
    for href in selector.xpath(xpath).extract():
        absolute_url = urljoin(response.url, href)
        log.msg("Catch an application: %s" % absolute_url, level=log.INFO)
        item = AppItem()
        item['url'] = absolute_url
        items.append(item)
    return items
def parse_xpath(self, response, xpath):
    """Build one AppItem per URL matched by ``xpath`` in ``response``.

    Each string extracted by the XPath query is resolved against
    ``response.url`` with ``urljoin``, logged at INFO level, and stored
    under the item's ``'url'`` key.

    Args:
        response: Response exposing ``xpath()`` and ``url``.
        xpath: XPath expression whose extracted strings are treated as URLs.

    Returns:
        A list of ``AppItem`` objects, in the order the matches were
        extracted.
    """
    appItemList = []
    # The query goes through the response's own xpath() method; the
    # separately constructed Selector that used to live here was never read.
    for url in response.xpath(xpath).extract():
        url = urljoin(response.url, url)
        logging.info("Catch an application: %s", url)
        appItem = AppItem()
        appItem['url'] = url
        appItemList.append(appItem)
    return appItemList
def parse_anzhi(response):
    """Build anzhi.com download items from ``onclick`` handlers in a response.

    The first run of digits found in each matched ``onclick`` attribute is
    substituted into the ``dl_app.php`` download URL as the ``s`` parameter.

    Args:
        response: Response wrapped in an ``HtmlXPathSelector``.

    Returns:
        A list of ``AppItem`` objects, one per handler that contains digits.
        Handlers without any digits are skipped.
    """
    xpath = "//div[@id='btn']/a/@onclick"
    appItemList = []
    hxs = HtmlXPathSelector(response)
    for script in hxs.select(xpath).extract():
        match = re.search(r"\d+", script)
        if match is None:
            # re.search returns None when the handler has no digits; there
            # is nothing to build a download URL from, so skip it instead
            # of failing on None.group().
            continue
        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (match.group(), )
        appItem = AppItem()
        appItem['url'] = url
        appItemList.append(appItem)
    return appItemList
def parse_anzhi(response):
    """Build anzhi.com download items from ``onclick`` handlers in a response.

    Looks at the ``onclick`` attribute of links inside
    ``div.detail_down``; the first run of digits in each attribute is
    substituted into the ``dl_app.php`` download URL as the ``s`` parameter.

    Args:
        response: Response wrapped in a ``Selector``.

    Returns:
        A list of ``AppItem`` objects, one per handler that contains digits.
        Handlers without any digits are skipped.
    """
    xpath = "//div[@class='detail_down']/a/@onclick"
    appItemList = []
    sel = Selector(response)
    for script in sel.xpath(xpath).extract():
        id_match = re.search(r"\d+", script)
        if id_match is None:
            # No digits in this handler: re.search returned None, so there
            # is no id to put in the download URL. Skip rather than crash.
            continue
        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id_match.group(), )
        appItem = AppItem()
        appItem['url'] = url
        appItemList.append(appItem)
    return appItemList
def parse_xpath(self, response, xpath, key):
    """Build one fully populated AppItem per URL matched by ``xpath``.

    Besides the resolved download URL, each item carries the app name,
    type, description, size, version, time and version info. Those values
    come from the per-field XPath rules in ``self.scrape_rules``, indexed
    by ``key``, with all whitespace removed.

    Args:
        response: Response whose ``body`` is parsed and whose ``url`` is the
            base for resolving extracted links.
        xpath: XPath expression whose extracted strings are treated as URLs.
        key: Index into each ``*_xpath`` rule mapping in
            ``self.scrape_rules``.

    Returns:
        A list of ``AppItem`` objects, in the order the URLs were extracted.
        Empty when ``xpath`` matches nothing.

    Raises:
        KeyError: If a rule category is missing from ``self.scrape_rules``,
            or (when at least one URL matched) ``key`` is missing from a
            rule mapping.
    """
    rules = self.scrape_rules
    name_xpath_rule = rules['name_xpath']
    type_xpath_rule = rules['type_xpath']
    size_xpath_rule = rules['size_xpath']
    description_xpath_rule = rules['description_xpath']
    version_xpath_rule = rules['version_xpath']
    time_xpath_rule = rules['time_xpath']
    versionInfo_xpath_rule = rules['versionInfo_xpath']

    # NOTE(review): the selector is built from the raw response body via
    # ``text=``; confirm this matches the input type/encoding expected by
    # the Scrapy version in use.
    sel = Selector(text=response.body)

    def compact_text(rule, drop_br=False):
        """Join every match of ``rule[key]`` and strip all whitespace."""
        text = ''.join(sel.xpath(rule[key]).extract())
        if drop_br:
            # Must run before whitespace removal: the literal contains a
            # space and would no longer match afterwards.
            text = text.replace('<br />', '')
        return ''.join(text.split())

    urls = sel.xpath(xpath).extract()
    if not urls:
        # Nothing matched: return before touching rule[key], exactly as a
        # loop over zero URLs would.
        return []

    # The field values depend only on the page and ``key``, not on the
    # individual URL, so extract them once and share them across items.
    shared_fields = [
        ('app_name', compact_text(name_xpath_rule)),
        ('app_type', compact_text(type_xpath_rule)),
        ('app_description', compact_text(description_xpath_rule,
                                         drop_br=True)),
        ('app_size', compact_text(size_xpath_rule)),
        ('app_version', compact_text(version_xpath_rule)),
        ('app_time', compact_text(time_xpath_rule)),
        ('app_versionInfo', compact_text(versionInfo_xpath_rule,
                                         drop_br=True)),
    ]

    appItemList = []
    for url in urls:
        appItem = AppItem()
        appItem['url'] = urljoin(response.url, url)
        for field, value in shared_fields:
            appItem[field] = value
        appItemList.append(appItem)
    return appItemList
def parse_xpath(self, response, xpath):
    """Turn every URL matched by ``xpath`` into an AppItem.

    The response is wrapped in an ``HtmlXPathSelector``; each extracted
    string is joined onto ``response.url``, reported through ``log.msg`` at
    INFO level, and saved as the item's ``'url'``.

    Args:
        response: Response to query; ``response.url`` is the join base.
        xpath: XPath expression selecting the link strings.

    Returns:
        A list of ``AppItem`` objects in extraction order.
    """
    found = []
    page = HtmlXPathSelector(response)
    for raw_link in page.select(xpath).extract():
        full_link = urljoin(response.url, raw_link)
        log.msg("Catch an application: %s" % full_link, level=log.INFO)
        entry = AppItem()
        entry['url'] = full_link
        found.append(entry)
    return found


# Disabled anzhi-specific variant (commented out in the original source):
#def parse_anzhi(self, response, xpath):
#    appItemList = []
#    hxs = HtmlXPathSelector(response)
#    for script in hxs.select(xpath).extract():
#        id = re.search(r"\d+", script).group()
#        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
#        appItem = AppItem()
#        appItem['url'] = url
#        appItemList.append(appItem)
#    return appItemList
def parse_anzhi(self, response, key):
    """Build populated anzhi.com download items from a response.

    The first run of digits in each matched ``onclick`` attribute (links
    inside ``div.detail_down``) is substituted into the ``dl_app.php``
    download URL as the ``s`` parameter. Each item also carries the app
    name, type, size and description, taken from the per-field XPath rules
    in ``self.scrape_rules`` indexed by ``key``, with all whitespace removed.

    Args:
        response: Response whose ``body`` is parsed.
        key: Index into each ``*_xpath`` rule mapping in
            ``self.scrape_rules``.

    Returns:
        A list of ``AppItem`` objects, one per handler that contains digits.
        Handlers without digits are skipped; the list is empty when no
        usable handler is found.

    Raises:
        KeyError: If a rule category is missing from ``self.scrape_rules``,
            or (when at least one item is produced) ``key`` is missing from
            a rule mapping.
    """
    xpath = "//div[@class='detail_down']/a/@onclick"
    name_xpath_rule = self.scrape_rules['name_xpath']
    type_xpath_rule = self.scrape_rules['type_xpath']
    size_xpath_rule = self.scrape_rules['size_xpath']
    description_xpath_rule = self.scrape_rules['description_xpath']

    # NOTE(review): the selector is built from the raw response body via
    # ``text=``; confirm this matches the input type/encoding expected by
    # the Scrapy version in use.
    sel = Selector(text=response.body)

    def compact_text(rule, drop_br=False):
        """Join every match of ``rule[key]`` and strip all whitespace."""
        text = ''.join(sel.xpath(rule[key]).extract())
        if drop_br:
            # Must run before whitespace removal: the literal contains a
            # space and would no longer match afterwards.
            text = text.replace('<br />', '')
        return ''.join(text.split())

    app_ids = []
    for script in sel.xpath(xpath).extract():
        id_match = re.search(r"\d+", script)
        if id_match is not None:
            app_ids.append(id_match.group())
        # Handlers without digits yield no id (re.search returns None) and
        # are skipped instead of failing on None.group().
    if not app_ids:
        # No usable handler: return before touching rule[key].
        return []

    # The field values depend only on the page and ``key``, not on the
    # individual handler, so extract them once and share them across items.
    shared_fields = [
        ('app_name', compact_text(name_xpath_rule)),
        ('app_type', compact_text(type_xpath_rule)),
        ('app_size', compact_text(size_xpath_rule)),
        ('app_description', compact_text(description_xpath_rule,
                                         drop_br=True)),
    ]

    appItemList = []
    for app_id in app_ids:
        appItem = AppItem()
        appItem['url'] = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (app_id,)
        for field, value in shared_fields:
            appItem[field] = value
        appItemList.append(appItem)
    return appItemList