def setUp(self):
        """Create the parser under test and load the ``test_apache7.html`` fixture.

        Sets ``self.parser``, ``self.root_page`` (the fixture's text) and
        ``self.base_url`` for the tests that follow.
        """
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache7.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://www.serenitystreetnews.com/videos/feb 2013/"
示例#2
0
    def __init__(self, url, test_url):
        """Set up crawl state and choose a parser for ``url``.

        :param url: Root address of the listing. It must start with
            ``"http"``; otherwise no parser is selected.
        :param test_url: When truthy, ``url`` is fetched and
            ``guess_parser`` picks the parser class from the response.
            When falsy, ``ApacheParser`` is used without any request.

        ``self.parser`` stays ``None`` whenever no parser could be selected:
        bad scheme, connection problem, non-200 status, or a page that no
        known parser recognises.
        """
        self.files = []
        self.parsed_urls = []
        self.base_url = url
        self.parser = None

        if not url.startswith("http"):
            print("Invalid Schema")
            return

        if not test_url:
            print("Using ApacheParser by default because test_url was set to False")
            self.parser = ApacheParser()  # Default parser
            return

        # Test url
        try:
            r = requests.get(self.base_url, timeout=10)  # todo change to 30
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError):
            print("Timed out / Connection refused")
            return

        if r.status_code != 200:
            print("Couldn't connect (" + str(r.status_code) + ")")
            return

        # guess_parser returns None when no parser matches; calling that
        # result directly would raise TypeError, so check it first.
        parser_class = self.guess_parser(r.text, r.headers)
        if parser_class is None:
            print("Couldn't find a parser for this page")
            return

        self.parser = parser_class()
        print("Using " + self.parser.__class__.__name__ + " as parser")
    def setUp(self):
        """Create the parser under test and load the ``test.html`` fixture.

        Sets ``self.parser``, ``self.root_page`` (the fixture's text) and
        ``self.base_url`` for the tests that follow.
        """
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://archive.scene.org/pub/resources/docs/"
    def setUp(self):
        """Create the parser under test and load the ``test_apache4.html`` fixture.

        Sets ``self.parser``, ``self.root_page`` (the fixture's text) and
        ``self.base_url`` for the tests that follow.
        """
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache4.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://jenserserver.no-ip.biz/movieserver/serien/bigbangtheorie/S3/"
    def setUp(self):
        """Create the parser under test and load the ``test_apache3.html`` fixture.

        Sets ``self.parser``, ``self.root_page`` (the fixture's text) and
        ``self.base_url`` for the tests that follow.
        """
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache3.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://files.duspectacle.com/mp3/Jardinets/"
    def setUp(self):
        """Create the parser under test and load the ``test_apache2.html`` fixture.

        Sets ``self.parser``, ``self.root_page`` (the fixture's text) and
        ``self.base_url`` for the tests that follow.
        """
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache2.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://akiraito.jpn.ph/g/%E6%98%A0%E7%94%BB%E3%83%BB%E3%83%89%E3%83%A9%E3%83%9E%E3%83%BB%E3%82%A2%E3%83%8B%E3%83%A1/%E3%80%90%E3%82%A2%E3%83%8B%E3%83%A1%E3%80%91%20%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%83%9C%E3%83%BC%E3%83%AB/%E3%80%90%E3%82%A2%E3%83%8B%E3%83%A1%E3%80%91%20%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%83%9C%E3%83%BC%E3%83%AB%EF%BC%BA%E3%80%80%E5%85%A8%EF%BC%92%EF%BC%99%EF%BC%91%E8%A9%B1/"
class ApacheParserTest2(TestCase):
    """Checks ``ApacheParser.get_links`` against the ``test_apache2.html`` fixture."""

    # Keys expected in the get_links() result.
    # NOTE(review): these look like mis-decoded (mojibake) names; presumably
    # they match how the fixture is decoded when read -- confirm before
    # "fixing" them.
    _EPISODE_011 = "ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ011˜b.u‰F’ˆˆê‚Ì‹­íŽmƒTƒCƒ„l‚ß‚´‚ß‚éIv.wmv"
    _EPISODE_019 = "ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z.‘æ019˜b.ud—͂Ƃ̐킢Iƒoƒuƒ‹ƒXŒN‚ð‚‚©‚Ü‚¦‚ëv.wmv"
    _JPG_DIRECTORY = "ƒhƒ‰ƒSƒ“ƒ{[ƒ‹Z%20jpg/"

    def setUp(self):
        """Create the parser under test and load the fixture page."""
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache2.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://akiraito.jpn.ph/g/%E6%98%A0%E7%94%BB%E3%83%BB%E3%83%89%E3%83%A9%E3%83%9E%E3%83%BB%E3%82%A2%E3%83%8B%E3%83%A1/%E3%80%90%E3%82%A2%E3%83%8B%E3%83%A1%E3%80%91%20%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%83%9C%E3%83%BC%E3%83%AB/%E3%80%90%E3%82%A2%E3%83%8B%E3%83%A1%E3%80%91%20%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%83%9C%E3%83%BC%E3%83%AB%EF%BC%BA%E3%80%80%E5%85%A8%EF%BC%92%EF%BC%99%EF%BC%91%E8%A9%B1/"

    def test_link_count(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(len(result), 297)

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result[self._EPISODE_011]["size"], 232185000)
        self.assertEqual(result[self._EPISODE_019]["size"], 185385000)

    def test_link_type(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result[self._EPISODE_011]["type"], "f")
        self.assertEqual(result[self._JPG_DIRECTORY]["type"], "d")

    def test_link_extension(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result[self._EPISODE_011]["ext"], "wmv")
class ApacheParserTest3(TestCase):
    """Checks ``ApacheParser.get_links`` against the ``test_apache3.html`` fixture."""

    def setUp(self):
        """Create the parser under test and load the fixture page."""
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache3.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://files.duspectacle.com/mp3/Jardinets/"

    def test_link_count(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(len(result), 21)

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        woodkid = "15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"
        self.assertEqual(result[woodkid]["size"], 9300000)
        self.assertEqual(
            result["16%20Yellow%20Ostrich%20-%20WHALE.mp3"]["size"], 7100000)

    def test_link_type(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        woodkid = "15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"
        introduction_dir = "01%20Jean%20Rochefort%20-%20Winnie%20et%20ses%20amis%20(introduction)/"
        self.assertEqual(result[woodkid]["type"], "f")
        self.assertEqual(result[introduction_dir]["type"], "d")

    def test_link_extension(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        woodkid = "15%20Woodkid%20-%20Iron%20(Remix%20By%20Gucci%20Vump).mp3"
        self.assertEqual(result[woodkid]["ext"], "mp3")
class ApacheParserTest5(TestCase):
    """Checks sizes reported by ``ApacheParser.get_links`` for the ``test.html`` fixture."""

    def setUp(self):
        """Create the parser under test and load the fixture page."""
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://archive.scene.org/pub/resources/docs/"

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["17toilet.txt"]["size"], 12700)
        self.assertEqual(result["288help.diz"]["size"], 9000)
示例#10
0
    def guess_parser(text, headers):
        """Return the first parser class that recognises ``text``.

        ``NginxParser`` is tried before ``ApacheParser``; each candidate is
        instantiated and asked through ``page_is_valid(text)``.

        :param text: Page body to inspect.
        :param headers: Response headers. Kept for interface compatibility;
            the detection below does not consult them.
        :return: The matching parser class (not an instance), or ``None``
            when neither candidate accepts the page.
        """
        for parser_class in (NginxParser, ApacheParser):
            if parser_class().page_is_valid(text):
                return parser_class

        return None
class ApacheParserTest7(TestCase):
    """Checks sizes reported by ``ApacheParser.get_links`` for the ``test_apache7.html`` fixture."""

    def setUp(self):
        """Create the parser under test and load the fixture page."""
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache7.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://www.serenitystreetnews.com/videos/feb 2013/"

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        emerald_tablets = "700%20Emerald%20Tablets%20Dark%20Brothers%20-%20YouTube.flv"
        survival_map = "Economic%20Collapse%20Survival%20Map%20-%20Risk%20Analysis%20of%20best%20area%20in%20United%20States%20-%20YouTube.flv"
        self.assertEqual(result[emerald_tablets]["size"], 145000000)
        self.assertEqual(result[survival_map]["size"], 28000000)
class ApacheParserTest4(TestCase):
    """Checks sizes reported by ``ApacheParser.get_links`` for the ``test_apache4.html`` fixture."""

    def setUp(self):
        """Create the parser under test and load the fixture page."""
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache4.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        self.base_url = "http://jenserserver.no-ip.biz/movieserver/serien/bigbangtheorie/S3/"

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        episode_6 = "The.Big.Bang.Theory.S03E06.Football.fuer.Nerds.German.WS.DVDRip.XviD-DELiCiOUS.avi"
        episode_3 = "The.Big.Bang.Theory.S03E03.Sex.oder.Pralinen.German.WS.DVDRip.XviD-DELiCiOUS.avi"
        self.assertEqual(result[episode_6]["size"], 175000000)
        self.assertEqual(result[episode_3]["size"], 0)
class ApacheParserTest(TestCase):
    """Checks ``ApacheParser`` against the ``test_apache1.html`` fixture."""

    def setUp(self):
        """Create the parser under test and load the fixture page."""
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache1.html", "r") as root_page_file:
            self.root_page = root_page_file.read()
        # Stored once instead of repeating the literal in every test.
        self.base_url = "https://keisari.net/videos/"

    def test_size_column(self):
        result = self.parser.get_size_columns(
            ['</a>', '175289', 'kB', '2008/10/21', '09:00:02', ''], "")
        result1 = self.parser.get_size_columns([
            '100pxfilename.jpg', '175289', 'kB', '2008/10/21', '09:00:02', ''
        ], "100pxfilename.jpg")

        self.assertEqual(result, (1, 2))
        self.assertEqual(result1, (1, 2))

    def test_link_count(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(len(result), 51)

    def test_link_size(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["happyday.mp4"]["size"], 772000)
        self.assertEqual(
            result["alex_r%c3%a4j%c3%a4ht%c3%a4%c3%a4.mp4"]["size"], 715000)

    def test_link_type(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["arnold_brownschwagger.mp4"]["type"], "f")
        self.assertEqual(result["to_be_continued/"]["type"], "d")

    def test_link_extension(self):
        result = self.parser.get_links(self.root_page, self.base_url)

        self.assertEqual(result["webm_thread_intro.mp4"]["ext"], "mp4")
示例#14
0
class Crawler:
    """Walk an HTTP directory listing recursively and collect its files.

    ``files`` accumulates one dict per file (``link``, ``size``, ``ext``),
    ``parsed_urls`` records every address ``crawl`` has been asked to visit,
    and ``parser`` is the listing parser in use, or ``None`` when no usable
    parser could be selected.
    """

    def __init__(self, url, test_url):
        """Set up crawl state and choose a parser for ``url``.

        :param url: Root address of the listing. It must start with
            ``"http"``; otherwise no parser is selected.
        :param test_url: When truthy, ``url`` is fetched and
            ``guess_parser`` picks the parser class from the response.
            When falsy, ``ApacheParser`` is used without any request.
        """
        self.files = []
        self.parsed_urls = []
        self.base_url = url
        self.parser = None

        if not url.startswith("http"):
            print("Invalid Schema")
            return

        if not test_url:
            print("Using ApacheParser by default because test_url was set to False")
            self.parser = ApacheParser()  # Default parser
            return

        # Test url
        try:
            r = requests.get(self.base_url, timeout=10)  # todo change to 30
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError):
            print("Timed out / Connection refused")
            return

        if r.status_code != 200:
            print("Couldn't connect (" + str(r.status_code) + ")")
            return

        # guess_parser returns None when no parser matches; calling that
        # result directly would raise TypeError, so check it first.
        parser_class = self.guess_parser(r.text, r.headers)
        if parser_class is None:
            print("Couldn't find a parser for this page")
            return

        self.parser = parser_class()
        print("Using " + self.parser.__class__.__name__ + " as parser")

    @staticmethod
    def guess_parser(text, headers):
        """Return the first parser class that recognises ``text``.

        ``NginxParser`` is tried before ``ApacheParser``; each candidate is
        instantiated and asked through ``page_is_valid(text)``.

        :param text: Page body to inspect.
        :param headers: Response headers. Kept for interface compatibility;
            the detection below does not consult them.
        :return: The matching parser class (not an instance), or ``None``
            when neither candidate accepts the page.
        """
        for parser_class in (NginxParser, ApacheParser):
            if parser_class().page_is_valid(text):
                return parser_class

        return None

    def crawl(self, address=None):
        """Visit ``address`` and, recursively, every directory it links to.

        Files found are appended to ``self.files``. An address is visited at
        most once, and addresses that do not start with ``self.base_url``
        are skipped.

        :param address: Address to crawl; ``None`` means ``self.base_url``.
        """
        # Resolve the default first so the root is recorded under its real
        # URL; otherwise a link back to the root would be crawled again.
        if address is None:
            address = self.base_url

        # Prevent unwanted recursion
        if address in self.parsed_urls:
            return
        self.parsed_urls.append(address)

        if self.parser is None:
            return

        if not address.startswith(self.base_url):
            print("Skipping " + address + " because it does not match " + self.base_url)
            return

        max_attempts = 20
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(address, timeout=10)
                break
            except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError):
                print("Timeout, " + str(max_attempts - attempt) + " retries left")
        else:
            # Every attempt failed; give up on this address.
            return

        links = self.parser.get_links(response.text, address)

        for k in links:
            entry = links[k]
            if entry["type"] == "d":
                print(entry["link"])
                self.crawl(entry["link"])
            else:
                self.files.append(dict(link=entry["link"], size=entry["size"], ext=entry["ext"]))

    def store_report(self, report_id, title):
        """Write the collected files as three report files.

        The outputs are ``static/reports/<report_id>_chart.json``,
        ``static/reports/<report_id>.json`` and
        ``static/reports/<report_id>.txt``, produced by ``ReportSaver``'s
        ``to_json_chart``, ``to_json`` and ``to_link_list`` respectively.

        :param report_id: String used as the file-name stem.
        :param title: Title handed to ``ReportSaver``.
        """
        report_saver = ReportSaver(self.files, title, ReportBuilder(self.files, self.base_url))

        with open("static/reports/" + report_id + "_chart.json", "w") as f:
            f.write(report_saver.to_json_chart())
        with open("static/reports/" + report_id + ".json", "w") as f:
            f.write(report_saver.to_json())
        with open("static/reports/" + report_id + ".txt", "w") as f:
            f.write(report_saver.to_link_list())
    def setUp(self):
        """Create the parser under test and load the ``test_apache1.html`` fixture.

        Sets ``self.parser`` and ``self.root_page`` (the fixture's text) for
        the tests that follow.
        """
        self.parser = ApacheParser()

        # The context manager closes the fixture even if read() raises.
        with open("test_apache1.html", "r") as root_page_file:
            self.root_page = root_page_file.read()