def main(self):
    """Crawl breadth-first from the configured seed sites.

    The seed URLs come from ``self.conf['initial']['sites']``.  Each URL is
    taken from the front of the work list, skipped if it was already
    visited, fetched through ``URLOpener`` and, when the response declares
    a ``text/html`` content type, parsed into a ``Document``.  The document
    is then handed to ``self._follow`` together with the work list and the
    visited list (presumably so that discovered links get enqueued --
    confirm against ``_follow``).

    Nothing is returned; the collected documents stay local to this call.
    """
    # NOTE(review): the indentation of this function was reconstructed from
    # its syntax; the nesting below is the only reading in which every name
    # is defined before use.
    collected = []
    pending = []
    fetcher = URLOpener()
    extractor = URLParser()
    # Not referenced again in this method; kept because the constructor may
    # have side effects (TODO confirm whether BotDB is still needed here).
    store = BotDB(self.conf)
    visited = []

    pending.extend(self.conf['initial']['sites'])
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(pending)

    while pending:
        url = pending.pop(0)
        if url in visited:
            continue
        visited.append(url)

        self.logger.info("Parsing site: {0}".format(url))
        self.logger.info("Len of queue: {0}".format(len(pending)))

        headers, data = fetcher.open(url)

        # Only responses that explicitly declare an HTML body are parsed.
        if 'Content-Type' not in headers:
            continue
        media_type = headers['Content-Type'].split(';')[0]
        if media_type != 'text/html':
            continue

        quad = extractor.parse(url)
        page = Document(quad[0], quad[1], quad[2], quad[3], headers, data)
        collected.append(page)
        self._follow(page, extractor, pending, visited, quad)
def main():
    """Fetch every configured seed site and print the text of each HTML page.

    Reads the configuration from ``../conf/config.yaml``, opens each URL in
    ``conf['initial']['sites']`` and, for responses whose Content-Type is
    ``text/html``, parses the site into a ``Document`` and prints its text.

    Responses without a Content-Type header are skipped instead of raising
    ``AttributeError``.

    Returns nothing; the documents built along the way are kept only in a
    local list.
    """
    # NOTE(review): the indentation of this function was reconstructed from
    # its syntax; the print is assumed to belong to the HTML branch because
    # ``doc`` is only defined there.
    conf = get_config('../conf/config.yaml')
    documents = []
    opener = URLOpener()
    parser = URLParser()
    sites = conf['initial']['sites']

    for site in sites:
        headers, data = opener.open(site)

        # getheader() can come back as None when the server sent no
        # Content-Type; calling .split() on that would crash the whole run.
        content_type = headers.getheader('Content-Type')
        if not content_type:
            continue
        if content_type.split(';')[0] != 'text/html':
            continue

        typ = parser.parse(site)
        doc = Document(typ[0], typ[1], typ[2], typ[3], headers, data)
        documents.append(doc)
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print(doc.get_text())