def __init__(self, path=''):
    """Parse the crawler configuration XML and collect the enabled sites.

    :param path: configuration source handed to ``etree.parse``.  Any
        falsy value (``''`` or ``None``) selects
        ``commvals.CRAWLER_CONFIG_XML`` instead.
    """
    super(ConfigLoader, self).__init__()
    self.logger = logger()
    # Truthiness instead of len(): None now falls back to the default
    # path rather than raising TypeError.
    self.path = path or commvals.CRAWLER_CONFIG_XML
    self.doc = etree.parse(self.path)
    # Every <site> element anywhere in the document.
    self.site = self.doc.xpath(u'//site')
    # Keep the ``value`` attribute ('' when absent) of each site whose
    # ``able`` attribute is 'true'; a missing ``able`` counts as enabled.
    self.sites = [
        node.attrib.get('value', '')
        for node in self.site
        if node.attrib.get('able', 'true') == 'true'
    ]
def __init__(self, m_url, gzip=False, snapshot=False, c_time=0):
    """Set up a thread that can fetch ``m_url`` with browser-like headers.

    :param m_url: URL to request; also sent as the ``Referer`` header.
    :param gzip: stored unchanged on ``self.gzip`` (not used here).
    :param snapshot: stored unchanged on ``self.snapshot`` (not used here).
    :param c_time: stored unchanged on ``self.c_time`` (not used here).
    """
    threading.Thread.__init__(self)
    # Pool of browser identification strings; one is picked per instance.
    self.agents = [
        ('Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36'
         ' (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36'),
        ('Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36'
         ' (KHTML, like Gecko) Chrome/40.0.2214.94 Safari/537.36'),
        ('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.3; WOW64;'
         ' Trident/7.0; .NET4.0E; .NET4.0C; InfoPath.3)'),
        ('Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36'
         ' (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36'),
    ]
    self.url = m_url
    self.gzip = gzip
    self.snapshot = snapshot
    # The HTTP field is 'User-Agent' (singular).  The previous
    # 'User-Agents' key was sent as an unrelated header, so the server
    # still saw urllib2's default agent string.
    self.header = {
        'User-Agent': random.choice(self.agents),
        'Referer': self.url,
    }
    self.logger = logger()
    self.c_time = c_time
    # Zero-argument callable: GET self.url with self.header and return
    # the raw response body.
    self.simple_open_url = lambda: urllib2.urlopen(
        urllib2.Request(self.url, headers=self.header)).read()