def fetchListPages(self, listtype="html"): print "Start to fetch and parse List" urls = self.listRule.getListUrls() for url in urls: print "Fetching list page: ", url, "charset:", safestr(self.seed["charset"]), "timeout:", safestr(self.seed["timeout"]) f = Fetch(url, charset = self.seed["charset"], timeout = self.seed["timeout"]) if f.isReady(): doc = f.read() if listtype == "html": self.parseListPage(f, doc, url) elif listtype == "json": self.parseJsonPage(f, doc, url) print "List has finished parsing. It has %s docs." % ansicolor.red(self.__len__())
def parseFeed(self):
    """Fetch the seed's feed and register one Item per feed entry.

    The feed is downloaded from ``self.seed.prefixurl`` using the seed's
    charset and timeout, then parsed with ``feedparser``. Each entry is
    wrapped in an ``Item`` carrying the entry's link and
    ``self.seed_type``, and stored in ``self.items`` under the key
    returned by ``self.getItemGUID``. Nothing is stored when the fetch
    is not ready.
    """
    # Single-argument parenthesised prints emit the same text as the
    # bare print statement.
    print("Start to fetch and parse Feed list")

    seed = self.seed
    fetcher = Fetch(seed.prefixurl, seed.charset, seed.timeout)
    if fetcher.isReady():
        parsed = feedparser.parse(fetcher.read())
        for entry in parsed["entries"]:
            wrapped = Item({"url": entry["link"], "type": self.seed_type})
            # Fall back to the "url" rule when no GUID rule is configured.
            if self.guid_rule is None:
                self.guid_rule = "url"
            # NOTE(review): the raw feed entry, not the Item built above,
            # is what gets passed to getItemGUID -- confirm this is intended.
            key = self.getItemGUID(entry)
            self.items[key] = wrapped

    summary = "List has finished parsing. It has %s docs."
    print(summary % ansicolor.red(self.__len__()))
def fetchListPages(self, listtype="html"): print "Start to fetch and parse List" urls = self.listRule.getListUrls() for url in urls: print "Fetching list page: ", url, "charset:", safestr( self.seed["charset"]), "timeout:", safestr( self.seed["timeout"]) f = Fetch(url, charset=self.seed["charset"], timeout=self.seed["timeout"]) if f.isReady(): doc = f.read() if listtype == "html": self.parseListPage(f, doc, url) elif listtype == "json": self.parseJsonPage(f, doc, url) print "List has finished parsing. It has %s docs." % ansicolor.red( self.__len__())
def parseFeed(self):
    """Fetch the seed's feed and register one Item per feed entry.

    The feed is downloaded from ``self.seed.prefixurl`` using the seed's
    charset and timeout, then parsed with ``feedparser``. Each entry is
    wrapped in an ``Item`` carrying the entry's link and
    ``self.seed_type``, and stored in ``self.items`` under the key
    returned by ``self.getItemGUID``. Nothing is stored when the fetch
    is not ready.
    """
    # Single-argument parenthesised prints emit the same text as the
    # bare print statement.
    print("Start to fetch and parse Feed list")

    seed = self.seed
    fetcher = Fetch(seed.prefixurl, seed.charset, seed.timeout)
    if fetcher.isReady():
        parsed = feedparser.parse(fetcher.read())
        for entry in parsed["entries"]:
            wrapped = Item({"url": entry["link"], "type": self.seed_type})
            # Fall back to the "url" rule when no GUID rule is configured.
            if self.guid_rule is None:
                self.guid_rule = "url"
            # NOTE(review): the raw feed entry, not the Item built above,
            # is what gets passed to getItemGUID -- confirm this is intended.
            key = self.getItemGUID(entry)
            self.items[key] = wrapped

    summary = "List has finished parsing. It has %s docs."
    print(summary % ansicolor.red(self.__len__()))