def fetchListPages(self, listtype="html"):
    # Assumes module-level imports of ansicolor and the local helpers
    # Fetch and safestr, plus a listRule object that yields list URLs.
    print "Starting to fetch and parse list pages"
    urls = self.listRule.getListUrls()
    for url in urls:
        print "Fetching list page:", url, "charset:", safestr(self.seed["charset"]), "timeout:", safestr(self.seed["timeout"])
        f = Fetch(url, charset=self.seed["charset"], timeout=self.seed["timeout"])
        if f.isReady():
            doc = f.read()
            # Dispatch to the parser that matches the page format.
            if listtype == "html":
                self.parseListPage(f, doc, url)
            elif listtype == "json":
                self.parseJsonPage(f, doc, url)
    # ansicolor.red expects a string, so convert the count first.
    print "List parsing finished: %s docs." % ansicolor.red(str(len(self)))
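# The Fetch wrapper is used throughout but not defined in this section.
# Below is a minimal sketch of what it might look like, built on urllib2;
# every method name here (isReady, read, getCharset, getCode, connected)
# is inferred from the call sites above and should be treated as an
# assumption, not the project's actual implementation.
import urllib2

class Fetch(object):
    def __init__(self, url, charset="utf-8", timeout=10):
        self.charset = charset
        self.connected = False
        self.response = None
        try:
            self.response = urllib2.urlopen(url, timeout=timeout)
            self.connected = True
        except urllib2.URLError, e:
            print "Fetch failed:", e

    def isReady(self):
        # Ready when the connection succeeded with an HTTP 200.
        return self.connected and self.response.getcode() == 200

    def read(self):
        # Decode the raw bytes with the declared (or configured) charset.
        return self.response.read().decode(self.getCharset(), "replace")

    def getCharset(self):
        # Prefer the charset from the Content-Type header, if present.
        return self.response.headers.getparam("charset") or self.charset

    def getCode(self):
        return self.response.getcode()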
def runTest(self):
    try:
        f = Fetch("http://tga.plu.cn")
        if f.connected:
            f.read()
            print f.getCharset(), f.getCode(), f.isReady()
    except Exception, e:
        print e
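# runTest is the method name unittest's loader falls back to when a
# TestCase defines no test_* methods, which suggests the method above
# lives on a unittest.TestCase subclass. A sketch of how it might be
# wired up; the class name FetchTest is hypothetical, and self.fail is
# used here instead of printing so errors actually fail the test.
import unittest

class FetchTest(unittest.TestCase):
    def runTest(self):
        try:
            f = Fetch("http://tga.plu.cn")
            if f.connected:
                f.read()
                print f.getCharset(), f.getCode(), f.isReady()
        except Exception, e:
            self.fail(e)

if __name__ == "__main__":
    unittest.main()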
def parseFeed(self):
    print "Starting to fetch and parse the feed list"
    seed = self.seed
    f = Fetch(seed.prefixurl, charset=seed.charset, timeout=seed.timeout)
    if f.isReady():
        feed = feedparser.parse(f.read())
        items = feed["entries"]
        for item in items:
            _item = Item({
                "url": item["link"],
                "type": self.seed_type
            })
            # Default to de-duplicating items by their URL.
            if self.guid_rule is None:
                self.guid_rule = "url"
            guid = self.getItemGUID(item)
            self.items[guid] = _item
    # ansicolor.red expects a string, so convert the count first.
    print "Feed parsing finished: %s docs." % ansicolor.red(str(len(self)))
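# A minimal sketch of the getItemGUID helper assumed by parseFeed: with
# the default "url" rule, items are keyed by a hash of their link so
# repeated feed entries collapse to a single stored item. The real
# helper is not shown in this section; this is inferred from the call
# site above.
import hashlib

def getItemGUID(self, item):
    if self.guid_rule == "url":
        # Hash the link; encode first since feedparser returns unicode.
        return hashlib.md5(item["link"].encode("utf-8")).hexdigest()
    # Otherwise fall back to the feed-supplied id, if any.
    return item.get("id", item["link"])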