def getPageLink(self, selector, prefix):
    """Collect the pager links of a page as completed URLs.

    Every ``href`` found under the ``div`` with class ``pager`` is passed
    through ``StrUtil.completeURL(prefix, href)``; the resulting list is then
    filtered using ``StrUtil.isEmpty`` as the predicate.

    Args:
        selector: object exposing ``xpath(...).extract()``.
        prefix: value handed to ``StrUtil.completeURL`` together with each
            extracted href.

    Returns:
        Whatever ``filter(StrUtil.isEmpty, links)`` yields.
    """
    hrefs = selector.xpath('//div[@class="pager"]//a/@href').extract()
    links = [StrUtil.completeURL(prefix, href) for href in hrefs]
    # NOTE(review): filter() keeps the entries for which the predicate is
    # truthy -- confirm that StrUtil.isEmpty has the intended polarity here.
    return filter(StrUtil.isEmpty, links)
def getTag(self, selector, item):
    """Store the page's tags in ``item['tag']`` as one "-"-separated string.

    Each tag text found under the ``side-tags clearfix`` container is passed
    through ``StrUtil.delWhiteSpace`` before joining. When the joined result
    is empty, the literal ``"NULL"`` is stored instead. The final value is
    logged via ``LogUtil.log``.
    """
    raw_tags = selector.xpath(
        '//div[@class="side-tags clearfix"]/div/a/text()').extract()
    joined = "-".join(StrUtil.delWhiteSpace(text) for text in raw_tags)
    item['tag'] = joined if joined else "NULL"
    LogUtil.log("tag(%s)" % item['tag'])
def getCategory(self, selector, item):
    """Store the page's categories in ``item['category']``, joined by "-".

    Each link text found under ``dd.tag-box`` is passed through
    ``StrUtil.delWhiteSpace`` before joining. When the joined result is
    empty, the literal ``"NULL"`` is stored instead. The final value is
    logged via ``LogUtil.log``.
    """
    raw_categories = selector.xpath(
        '//dd[@class="tag-box"]/a/text()').extract()
    joined = "-".join(
        StrUtil.delWhiteSpace(text) for text in raw_categories)
    item['category'] = joined if joined else "NULL"
    LogUtil.log("category(%s)" % item['category'])
def getEditorComment(self, selector, item):
    """Store the first ``head-content`` text in ``item['editor_comment']``.

    The first match of the XPath (or the literal ``"NULL"`` when nothing
    matches) is passed through ``StrUtil.delWhiteSpace`` and the stored value
    is logged via ``LogUtil.log``.
    """
    matches = selector.xpath(
        '//div[@class="app-detail"]//span[@class="head-content"]/text()'
    ).extract()
    raw_comment = matches[0] if matches else "NULL"
    item['editor_comment'] = StrUtil.delWhiteSpace(raw_comment)
    LogUtil.log("editor_comment(%s)" % item['editor_comment'])
def getName(self, selector, item):
    """Store the application name in ``item['name']``.

    The name is read from the ``h1.app-name`` span inside ``div.app-intro``.
    The first match (or the literal ``"NULL"`` when nothing matches) is
    passed through ``StrUtil.delWhiteSpace`` and then logged.
    """
    matches = selector.xpath(
        '//div[@class="app-intro"]//h1[@class="app-name"]/span/text()'
    ).extract()
    raw_name = matches[0] if matches else "NULL"
    item['name'] = StrUtil.delWhiteSpace(raw_name)
    LogUtil.log("name(%s)" % item['name'])
def getVersion(self, selector, item):
    """Store the application version in ``item['version']``.

    The value is taken from the text of the first node following the ``dt``
    of the ``infos-list`` whose text equals the label used in the XPath
    below. When found, it is passed through ``StrUtil.delWhiteSpace``;
    otherwise the literal ``"NULL"`` is stored as-is. The result is logged.
    """
    # Earlier positional selector, kept for reference:
    # xpath = '//dl[@class="infos-list"]/dd[5]/text()'
    matches = selector.xpath(
        u'//dl[@class="infos-list"]/dt[text() = "版本"]/following::*[1]/text()'
    ).extract()
    if matches:
        item['version'] = StrUtil.delWhiteSpace(matches[0])
    else:
        item['version'] = "NULL"
    LogUtil.log("version(%s)" % item['version'])
def getName(self, selector, item):
    """Store the application name in ``item['name']``.

    The name is read from the ``span.title`` carrying ``itemprop="name"``
    inside ``p.app-name``. The first match (or the literal ``"NULL"`` when
    nothing matches) is passed through ``StrUtil.delWhiteSpace`` and logged.
    """
    matches = selector.xpath(
        '//p[@class="app-name"]/span[@class="title" and @itemprop="name"]/text()'
    ).extract()
    raw_name = matches[0] if matches else "NULL"
    item['name'] = StrUtil.delWhiteSpace(raw_name)
    LogUtil.log("name(%s)" % item['name'])
def getDescInfo(self, selector, item):
    """Store the long description text in ``item['desc_info']``.

    All text nodes under the ``brief-long`` paragraphs of ``div.app-detail``
    are joined with single spaces. The joined text (or the literal ``"NULL"``
    when nothing matches) is passed through ``StrUtil.delWhiteSpace`` and
    the stored value is logged.
    """
    fragments = selector.xpath(
        '//div[@class="app-detail"]//div[@class="brief-long"]/p//text()'
    ).extract()
    # Alternative extraction that was tried earlier:
    # fragments = selector.xpath(xpath).xpath('string(., " ")').extract()
    raw_desc = " ".join(fragments) if fragments else "NULL"
    item['desc_info'] = StrUtil.delWhiteSpace(raw_desc)
    LogUtil.log("desc_info(%s)" % item['desc_info'])
def getDescInfo(self, selector, item):
    """Store the description text in ``item['desc_info']``.

    All text nodes under the ``div`` carrying ``itemprop="description"`` are
    joined with single spaces. The joined text (or the literal ``"NULL"``
    when nothing matches) is passed through ``StrUtil.delWhiteSpace`` and
    the stored value is logged.
    """
    fragments = selector.xpath(
        '//div[@itemprop="description"]//text()').extract()
    # Alternative extraction that was tried earlier:
    # fragments = selector.xpath(xpath).xpath('string(., " ")').extract()
    raw_desc = " ".join(fragments) if fragments else "NULL"
    item['desc_info'] = StrUtil.delWhiteSpace(raw_desc)
    LogUtil.log("desc_info(%s)" % item['desc_info'])
def getSource(self, selector, item):
    """Store the application's origin label in ``item['source']``.

    ``item['source']`` is first set to the literal ``"NULL"``. If the
    ``a.origin`` link inside the ``origin-wrap`` container yields any text,
    the first match is passed through ``StrUtil.delWhiteSpace`` and replaces
    that default. The final value is logged.
    """
    # The default is assigned before querying, so it is already in place
    # should the lookup below fail.
    item['source'] = "NULL"
    matches = selector.xpath(
        '//div[@class="app-intro"]//div[@class="origin-wrap"]//a[@class="origin"]/text()'
    ).extract()
    if matches:
        item['source'] = StrUtil.delWhiteSpace(matches[0])
    LogUtil.log("source(%s)" % item['source'])
def loadStartURLs(self):
    """Populate ``self.start_urls`` with the crawl entry points.

    One URL is appended per line of ``data/apps.txt``: the line is passed
    through ``StrUtil.delWhiteSpace`` and appended to the apps prefix.
    Three fixed listing URLs are appended afterwards.

    Raises:
        IOError/OSError: if ``data/apps.txt`` cannot be opened or read.
    """
    prefix = "http://www.wandoujia.com/apps/"

    # URLs from file. The ``with`` block guarantees the handle is closed even
    # if reading or appending raises part-way through.
    with open('data/apps.txt', 'r') as apps_file:
        for line in apps_file:
            self.start_urls.append(prefix + StrUtil.delWhiteSpace(line))

    # Fixed URLs
    self.start_urls.append("http://www.wandoujia.com/apps")  # apps home page
    self.start_urls.append("http://www.wandoujia.com/category/app")  # Android software
    self.start_urls.append("http://www.wandoujia.com/category/game")  # Android games
    # self.start_urls.append("http://www.wandoujia.com/apps/air.jp.funkyland.AliceHouse2")  # old-style app page
    # self.start_urls.append("http://www.wandoujia.com/apps/com.tencent.mm")  # new-style app page
    # self.start_urls.append("http://www.wandoujia.com/category/408")  # travel category home page
    return