class Crawler(object):
    '''
    Connection object for Cauthon
    '''
    def __init__(self, node_type=None, conf_path=None, opts=None):
        '''
        Set up main class
        '''
        if conf_path is None:
            conf_path = '/etc/cauthon'

        if opts:
            self.opts = opts
        else:
            self.opts = cauthon.config.load_config(
                '{0}/cauthon'.format(conf_path),
            )

        self.node_type = node_type
        self.master_opts = salt.config.master_config('{0}/master'.format(conf_path))
        self.minion_opts = salt.config.minion_config('{0}/minion'.format(conf_path))

        # Build a persistent HTTP session with a configurable user agent
        header_dict = {}
        header_dict['User-agent'] = self.opts.get('user-agent', DEFAULT_AGENT)
        self.session = requests.Session()
        self.session.auth = self.opts.get('auth', None)
        self.session.verify = self.opts.get('verify_ssl', True)
        self.session.headers.update(header_dict)

        # Persist cookies on disk; create the jar if it doesn't exist yet
        cookie_jar = self.opts.get(
            'cookie_jar', '/var/cache/cauthon/cookies.txt'
        )
        self.session.cookies = cookielib.LWPCookieJar(cookie_jar)
        if not os.path.isfile(cookie_jar):
            self.session.cookies.save()
        self.session.cookies.load()

        self.proxies = self.opts.get('proxies', {})
        self.base_dir = self.opts.get('base_dir', '/var/cache/cauthon/sites')

        # Cache database, site map, and content filters
        self.db = Database()  # pylint: disable=invalid-name
        self.db.connect(self.opts.get('db_driver', 'sqlite3'))
        self.sitemap_load()
        self.filters = Filters()

    def sitemap_load(self):
        '''
        Load the site.map into self.site_map
        '''
        sitemap_file = self.opts.get('site_map', '/etc/cauthon/site.map')
        with salt.utils.fopen(sitemap_file, 'r') as sm_:
            self.site_map = yaml.safe_load(sm_)

    def sitemap_append(self, domain, fname):
        '''
        Add a domain -> filename entry to self.site_map and write the
        updated map back to the site.map file
        '''
        sitemap_file = self.opts.get('site_map', '/etc/cauthon/site.map')
        self.site_map[domain] = fname
        with salt.utils.fopen(sitemap_file, 'w') as sm_:
            yaml.dump(self.site_map, sm_, default_flow_style=False)

    def fetch(self,  # pylint: disable=too-many-arguments
              url,
              method='GET',
              params=None,
              data=None,
              force=False,
              req_kwargs=None):
        '''
        Fetch a URL and stash in a database as necessary
        '''
        if req_kwargs is None:
            req_kwargs = {}

        content = None
        cached = False
        if force is False:
            # Check the cache first; use a separate cursor variable so the
            # ``data`` argument (the request body) is not clobbered
            cur = self.db.client.execute(
                'SELECT content FROM sites WHERE url = ?', (url,)
            )
            row = cur.fetchone()
            if row:
                content = row[0]
                cached = True

        if content is None:
            result = self.session.request(
                method, url, params=params, data=data, proxies=self.proxies, **req_kwargs
            )
        else:
            # Wrap cached content so it behaves like a response object
            result = ReqHook(content)

        parser = bs4.BeautifulSoup(result.content, 'html.parser')
        title = parser.find_all('title')[0].contents[0]

        if not cached:
            self.db.insert(
                'sites',
                url,
                result.content,
                title,
                str(int(time.time())),
            )

        return result, parser

    def scrape(self, url, module=None):
        '''
        Scrape a URL and collect some links
        '''
        return self.filters.scrape(self, url, module)

    def download(self, url):
        '''
        Download the links returned by scrape()
        '''
        print('Downloading {0}'.format(url))
        for link in self.filters.scrape(self, url):
            # Reset the Referer header for each link
            if 'Referer' in self.session.headers:
                del self.session.headers['Referer']
            self.session.headers.update({'Referer': str(url)})

            stream = requests.get(
                link,
                proxies=self.proxies,
                headers=self.session.headers,
                stream=True,
            )

            # Mirror the URL's path under base_dir/<netloc>/
            urlparser = urlparse.urlparse(link)
            comps = urlparser.path.split('/')
            out_path = os.path.join(
                self.base_dir, urlparser.netloc, *comps[:-1]
            )
            try:
                os.makedirs(out_path)
            except OSError:
                pass

            out_file = os.path.join(self.base_dir, urlparser.netloc, *comps)
            if not os.path.exists(out_file):
                print('Saving {0}'.format(link))
                # Write in binary mode since downloads may not be text
                with salt.utils.fopen(out_file, 'wb') as fh_:
                    stream.raw.decode_content = True
                    shutil.copyfileobj(stream.raw, fh_)
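

# Minimal usage sketch (not part of the original module): shows one way the
# Crawler might be driven. The config path and URL below are placeholder
# assumptions, and this presumes /etc/cauthon and the cache database are
# already set up on the host.
if __name__ == '__main__':
    crawler = Crawler(conf_path='/etc/cauthon')
    # Fetch a page; it is served from the cache database if already stored
    result, parser = crawler.fetch('http://example.com/')
    print(parser.title)
    # Scrape the page for links and mirror them under base_dir
    crawler.download('http://example.com/')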