from unittest import TestCase

import requests

# NOTE: the import paths for the project's own modules are assumed here.
from parser import ApacheParser, NginxParser
from reportsaver import ReportBuilder, ReportSaver


class ApacheParserTest2(TestCase):
    def setUp(self):
        self.parser = ApacheParser()
        with open("test_apache2.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://akiraito.jpn.ph/g/%E6%98%A0%E7%94%BB%E3%83%BB%E3%83%89%E3%83%A9%E3%83%9E%E3%83%BB%E3%82%A2%E3%83%8B%E3%83%A1/%E3%80%90%E3%82%A2%E3%83%8B%E3%83%A1%E3%80%91%20%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%83%9C%E3%83%BC%E3%83%AB/%E3%80%90%E3%82%A2%E3%83%8B%E3%83%A1%E3%80%91%20%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%83%9C%E3%83%BC%E3%83%AB%EF%BC%BA%E3%80%80%E5%85%A8%EF%BC%92%EF%BC%99%EF%BC%91%E8%A9%B1/"

    def test_link_count(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(len(result), 297)

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        # The keys below are the raw, mis-encoded link names exactly as the
        # parser extracts them from the test page.
        self.assertEqual(
            result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"]["size"],
            232185000)
        self.assertEqual(
            result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ019˜b.ud—Í‚Æ‚Ìí‚¢Iƒoƒuƒ‹ƒXŒN‚ð‚‚©‚Ü‚¦‚ëv.wmv"]["size"],
            185385000)

    def test_link_type(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(
            result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"]["type"],
            "f")
        self.assertEqual(result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z%20jpg/"]["type"], "d")

    def test_link_extension(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(
            result["ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"]["ext"],
            "wmv")

class ApacheParserTest3(TestCase):
    def setUp(self):
        self.parser = ApacheParser()
        with open("test_apache3.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://files.duspectacle.com/mp3/Jardinets/"

    def test_link_count(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(len(result), 21)

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(
            result["15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"]["size"],
            9300000)
        self.assertEqual(
            result["16%20Yellow%20Ostrich%20-%20WHALE.mp3"]["size"], 7100000)

    def test_link_type(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(
            result["15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"]["type"],
            "f")
        self.assertEqual(
            result["01%20Jean%20Rochefort%20-%20Winnie%20et%20ses%20amis%20(introduction)/"]["type"],
            "d")

    def test_link_extension(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(
            result["15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"]["ext"],
            "mp3")

class ApacheParserTest5(TestCase):
    def setUp(self):
        self.parser = ApacheParser()
        with open("test.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://archive.scene.org/pub/resources/docs/"

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(result["17toilet.txt"]["size"], 12700)
        self.assertEqual(result["288help.diz"]["size"], 9000)

class ApacheParserTest7(TestCase):
    def setUp(self):
        self.parser = ApacheParser()
        with open("test_apache7.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://www.serenitystreetnews.com/videos/feb 2013/"

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(
            result["700%20Emerald%20Tablets%20Dark%20Brothers%20-%20YouTube.flv"]["size"],
            145000000)
        self.assertEqual(
            result["Economic%20Collapse%20Survival%20Map%20-%20Risk%20Analysis%20of%20best%20area%20in%20United%20States%20-%20YouTube.flv"]["size"],
            28000000)

class ApacheParserTest4(TestCase):
    def setUp(self):
        self.parser = ApacheParser()
        with open("test_apache4.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://jenserserver.no-ip.biz/movieserver/serien/bigbangtheorie/S3/"

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)
        self.assertEqual(
            result["The.Big.Bang.Theory.S03E06.Football.fuer.Nerds.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"],
            175000000)
        self.assertEqual(
            result["The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi"]["size"],
            0)

class ApacheParserTest(TestCase):
    def setUp(self):
        self.parser = ApacheParser()
        with open("test_apache1.html", "r") as root_page_file:
            self.root_page = root_page_file.read()

    def test_size_column(self):
        # get_size_columns takes the tokenized columns of a listing row and,
        # judging by the expected (1, 2) result, returns the column indices
        # of the size value ('175289') and its unit ('kB').
        result = self.parser.get_size_columns(
            ['</a>', '175289', 'kB', '2008/10/21', '09:00:02', ''], "")
        result1 = self.parser.get_size_columns(
            ['100pxfilename.jpg', '175289', 'kB', '2008/10/21', '09:00:02', ''],
            "100pxfilename.jpg")
        self.assertEqual(result, (1, 2))
        self.assertEqual(result1, (1, 2))

    def test_link_count(self):
        result = self.parser.get_links(self.root_page, "https://keisari.net/videos/")
        self.assertEqual(len(result), 51)

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, "https://keisari.net/videos/")
        self.assertEqual(result["happyday.mp4"]["size"], 772000)
        self.assertEqual(
            result["alex_r%c3%a4j%c3%a4ht%c3%a4%c3%a4.mp4"]["size"], 715000)

    def test_link_type(self):
        result = self.parser.get_links(self.root_page, "https://keisari.net/videos/")
        self.assertEqual(result["arnold_brownschwagger.mp4"]["type"], "f")
        self.assertEqual(result["to_be_continued/"]["type"], "d")

    def test_link_extension(self):
        result = self.parser.get_links(self.root_page, "https://keisari.net/videos/")
        self.assertEqual(result["webm_thread_intro.mp4"]["ext"], "mp4")

class Crawler:
    def __init__(self, url, test_url):
        self.files = []
        self.parsed_urls = []
        self.base_url = url

        if url.startswith("http"):
            if test_url:
                # Probe the url once to pick a matching parser
                try:
                    r = requests.get(self.base_url, timeout=10)  # todo change to 30
                    if r.status_code == 200:
                        parser_class = self.guess_parser(r.text, r.headers)
                        if parser_class is not None:
                            self.parser = parser_class()
                            print("Using " + self.parser.__class__.__name__ + " as parser")
                        else:
                            print("Couldn't guess a parser for this page")
                            self.parser = None
                    else:
                        print("Couldn't connect (" + str(r.status_code) + ")")
                        self.parser = None
                except (requests.exceptions.ReadTimeout,
                        requests.exceptions.ConnectTimeout,
                        requests.exceptions.ConnectionError):
                    print("Timed out / Connection refused")
                    self.parser = None
            else:
                print("Using ApacheParser by default because test_url was set to False")
                self.parser = ApacheParser()  # Default parser
        else:
            print("Invalid URL scheme")
            self.parser = None

    @staticmethod
    def guess_parser(text, headers):
        # The Server header is read but not yet used for detection
        server = headers["Server"] if "Server" in headers else ""

        # Try nginx
        parser = NginxParser()
        if parser.page_is_valid(text):
            return NginxParser

        # Try apache
        parser = ApacheParser()
        if parser.page_is_valid(text):
            return ApacheParser

        return None

    def crawl(self, address=None):
        if self.parser is None:
            return
        if address is None:
            address = self.base_url

        # Prevent unwanted recursion into pages that were already parsed
        if address in self.parsed_urls:
            return
        self.parsed_urls.append(address)

        if not address.startswith(self.base_url):
            print("Skipping " + address + " because it does not match " + self.base_url)
            return

        # Retry the request a few times before giving up; leaving response
        # as None signals that every attempt failed.
        retries = 20
        response = None
        while retries >= 0 and response is None:
            try:
                response = requests.get(address, timeout=10)
            except (requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError):
                print("Timeout, " + str(retries) + " retries left")
                retries -= 1
        if response is None:
            return

        links = self.parser.get_links(response.text, address)

        for k in links:
            if links[k]["type"] == "d":
                print(links[k]["link"])
                self.crawl(links[k]["link"])
            else:
                self.files.append(dict(link=links[k]["link"],
                                       size=links[k]["size"],
                                       ext=links[k]["ext"]))

    def store_report(self, report_id, title):
        report_saver = ReportSaver(self.files, title,
                                   ReportBuilder(self.files, self.base_url))

        with open("static/reports/" + report_id + "_chart.json", "w") as f:
            f.write(report_saver.to_json_chart())
        with open("static/reports/" + report_id + ".json", "w") as f:
            f.write(report_saver.to_json())
        with open("static/reports/" + report_id + ".txt", "w") as f:
            f.write(report_saver.to_link_list())
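

# Minimal usage sketch (an illustration, not part of the original module):
# the URL, report id, and title below are placeholders, and store_report()
# assumes the "static/reports/" directory already exists.
if __name__ == "__main__":
    crawler = Crawler("http://example.com/files/", test_url=True)
    crawler.crawl()  # walks the directory listing recursively, collecting files
    crawler.store_report("example", "Example report")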