def test_get_host(self):
    """get_host yields the bare site name regardless of scheme, www prefix, or trailing slash."""
    cases = [
        ('https://www.google.com/', 'google'),
        ('http://www.google.com/', 'google'),
        ('www.google.com', 'google'),
        ('google.com', 'google'),
        ('https://www.style-files.com/', 'style-files'),
        ('http://www.style-files.com/', 'style-files'),
        ('www.style-files.com/', 'style-files'),
        ('style-files.com/', 'style-files'),
    ]
    for url, expected in cases:
        self.assertEqual(get_host(url), expected)
def main():
    """CLI entry point: scrape title, text, and images from the given URL."""
    parser = argparse.ArgumentParser(
        description="""scrape selective site contents""")
    parser.add_argument('url', help='Target URL')
    args = parser.parse_args()

    # Map the URL to a bare host name, then look up a matching extractor class.
    host = get_host(args.url)
    extractor_cls = get_extractor(host)
    if not extractor_cls:
        # No extractor registered for this site — exit cleanly, not an error.
        logger.info('No parser for %s', host)
        sys.exit(0)

    # Instantiate the extractor and run each scrape step in turn.
    extractor = extractor_cls(args.url)
    extractor.title()
    extractor.text()
    extractor.images()