[trh.extract() for trh in trash] except IndexError: trash = body.findAll(tag) if trash: [trh.extract() for trh in trash] comments = body.findAll(text=lambda text:isinstance(text, Comment)) [comment.extract() for comment in comments] return body # получаем настройки def getSetings(self): if not 'http://' in self.url: self.url = 'http://' + self.url n = self.url.replace('www.', '').split('/')[2] try: self.settings = SITES[n] except KeyError: self.settings = SITES['default'] # разбиваем id/class:name по двоеточию def indent(self, txt): return txt.split(':') #тест на 4х сайтах по 10 урлов с каждого if __name__ == '__main__': from saver import Saver for line in open('testLink.txt'): line = line.rstrip() test = Parser(line) f = Saver(line) f.saveFile(test.result())
# -*- coding: utf-8 -*-
# Command-line entry point: build a Parser for the URL given as the first
# argument and pass its result() to Saver.saveFile().
import sys

from parser import Parser
from saver import Saver

try:
    url = sys.argv[1]
except IndexError:
    print('Не передан URL')  # "No URL was passed"
    # Exit with a non-zero status so shells and calling scripts can detect
    # the missing argument; a bare sys.exit() would report success.
    sys.exit(1)

obj = Parser(url)
text = obj.result()
f = Saver(url)
f.saveFile(text)