示例#1
0
 def analyze_url(self, page: Page):
     """Process one page: fetch it, record keyword matches, enqueue new links.

     The page is marked seen immediately; processing stops early when the
     domain filter, the fetch, robots rules, or the URL budget reject it.
     """
     self.seen_urls.add(page)
     if not self.check_domains(str(page)):
         return
     html = self.get_html(page.url)
     if html is None:
         return
     if self.analyze_robot(page.url):
         return
     if self.visited_urls_count >= self.max_count_urls:
         return
     self.visited_urls_count += 1
     parser = Parser(page.url)
     info = parser.get_info(html, str(page))
     # A page is a result when it mentions any requested keyword.
     if self.request.intersection(info) and page not in self.result_urls:
         self.result_urls.add(page)
         self.update_parents()
         if self.download:
             self.write_html(page, html)
     for link in set(parser.get_urls(html)) - self.seen_urls:
         if not link:
             continue
         # Trailing-slash URLs are normalised to their parent path.
         next_page = Page(link.parent) if str(link).endswith('/') else Page(link)
         self.urls.put(next_page)
    async def test_get_ratings_per_user(self):
        """Per-user ratings parsed from the user-review fixture match exactly."""
        html = await self._load_movie_user_reviews()
        expected = [('127878', 10), ('78539', 10), ('42238', 1), ('70404', 9)]
        self.assertEqual(expected, Parser(html).get_ratings_per_user())
示例#3
0
class Crawler:
    """Fetches the page for a search keyword and prints the parsed result."""

    def __init__(self) -> None:
        self.downloader = Downloader()
        self.parser = Parser()

    def scrape(self, keyword: Optional[str] = None) -> bool:
        """Download the page for *keyword* and print its parsed contents.

        Args:
            keyword: search keyword; a falsy value means there is nothing to do.

        Returns:
            True when a page was fetched and printed, False otherwise.
        """
        if not keyword:
            return False
        html = self.downloader.get_text_from_keyword(keyword)
        if not html:
            return False
        self.parser.print_result_from_parser(html)
        return True

    def change_url(self):
        """change_url (no behaviour yet)."""
示例#4
0
 def setUp(self):
     """Create the parser under test plus one well-formed and one truncated soup."""
     good_html = '<!DOCTYPE html>\n<html>\n\n<head>\n <title>Cats and Dogs</title> \n<meta name="description" content="Page about cats and dogs"> \n <meta name="keywords" content="cats,dogs">\n</head><body><a href="www.dogs.com">Dogs</a><a href="www.cats.com">Cats</a></body></html>'
     self.parser = Parser()
     self.test_soup = BeautifulSoup(good_html, 'html.parser')
     self.bad_test_soup = BeautifulSoup('<!DOCTYPE html>\n<html>', 'html.parser')
示例#5
0
    def _parse_content(self, url, content):
        """Mark *url* visited, enqueue its outbound links, and harvest ratings
        or metadata depending on which section of the site the URL belongs to."""
        if url in self._visited:
            return
        self._visited.add(url)
        self.logger.info(f'url {url} visited')

        parser = Parser(content)
        links = parser.get_links() - self._visited
        self.logger.info(f'{len(links)} links added')
        for link in links:
            self._push_url(link)

        if 'other/moviePoint' in url:
            extracted = self._parse_ratings_per_user(url, parser)
            if extracted:
                self._ratings.extend(extracted)
        elif 'moviedb/main' in url:
            meta = self._parse_metadata(url, parser)
            if meta:
                self._metadata.append(meta)
            extracted = self._parse_ratings_per_movie(url, parser)
            if extracted:
                self._ratings.extend(extracted)
        elif 'moviedb/grade' in url:
            # NOTE(review): grade pages reuse the per-movie parser — confirm
            # this is intended rather than a per-user variant.
            extracted = self._parse_ratings_per_movie(url, parser)
            if extracted:
                self._ratings.extend(extracted)
    async def test_get_ratings_per_movie_1(self):
        """Ratings parsed from the movie main-page fixture match exactly."""
        html = await self._load_movie_main()
        expected = [('ckh5SQ==', 0), ('Q0s1Yk0=', 9), ('OHVFYTQ=', 10),
                    ('NGsxa0M=', 10), ('NHJ5aHM=', 10)]
        self.assertEqual(expected, Parser(html).get_ratings_per_movie())
示例#7
0
 def __init__(self):
     """Wire up crawler, graph, and parser; exit the process on any failure."""
     try:
         self.crawler = Crawler()
         # NOTE(review): 'pgrpah' looks like a typo for 'pgraph', but other
         # code reads this attribute as-is, so the name is preserved.
         self.pgrpah = Graph()
         self.parser = Parser()
     except Exception as e:
         print("ERROR " + str(e))
         sys.exit(-1)
    async def test_get_ratings_per_movie_2(self):
        """Ratings parsed from the movie-reviews fixture match exactly."""
        html = await self._load_movie_reviews()
        expected = [('OEhrRm4=', 9), ('OEVQQnU=', 1), ('MkpxV2Y=', 1),
                    ('QmtPMWI=', 9), ('VU9ROA==', 7), ('dEdabQ==', 10),
                    ('NFB0NUY=', 6), ('M0tpUlA=', 10), ('QzF4Wnc=', 10),
                    ('OUMwUnA=', 7)]
        self.assertEqual(expected, Parser(html).get_ratings_per_movie())
    async def test_get_metadata(self):
        """Title, genre, country and running time come from the main page."""
        html = await self._load_movie_main()
        metadata = Parser(html).get_metadata()

        self.assertIsNotNone(metadata)
        self.assertEqual('우리집 (2019)', metadata['title'])
        self.assertEqual('드라마/가족', metadata['genre'])
        self.assertEqual('한국', metadata['country'])
        self.assertEqual(92, metadata['running_time'])
    async def test_get_links(self):
        """Links from the main page include the movie and both person pages."""
        html = await self._load_movie_main()
        links = Parser(html).get_links()

        self.logger.info('# of links: %d', len(links))

        for fragment in ('movieId=128635', 'personId=271829', 'personId=518464'):
            self.assertTrue(any(fragment in link for link in links))
示例#11
0
    def get_app_from_link(self):
        """Fetch the page behind ``self.link`` and parse it into an app object.

        The first run of digits in the link is taken as the app's uid.

        Returns:
            The parsed app, or 0 when the parser reports failure.
        """
        uid = int(re.search(r'\d+', self.link).group())
        start_page = requests.get(self.link)
        app = Parser(start_page, uid).parse()

        # Bug fix: the original used ``app is 0`` — identity comparison with an
        # int literal is implementation-dependent and raises a SyntaxWarning on
        # modern CPython; compare by value instead.
        if app == 0:
            return 0

        return app
示例#12
0
class TestParser(unittest.TestCase):
	"""Tests for Parser's HTML feed handling: start tags, text data, links."""

	def setUp(self):
		self.parser = Parser("http://google.co.uk")
		self.urls = ["http://google.co.uk", "http://news.bbc.co.uk"]

	def tearDown(self):
		pass

	def test_get_start_tag(self):
		# An opening tag with no following text leaves its data entry unset.
		body = '<title>'
		self.parser.feed(body)
		self.assertEqual(self.parser.data['title'], None)

	def test_get_data(self):
		# Text after an open tag is recorded under that tag's name.
		body = '<title>Hey'
		self.parser.feed(body)
		self.assertEqual(self.parser.data['title'], 'Hey')

	def test_get_links(self):
		# NOTE(review): this test reads data['urls'] while test_ignores_tags
		# below expects a 'webpage_urls' key — confirm which key Parser uses.
		body = "<h1>Hey</h1><a href='%s'>Link 1</a><a href='%s'>Link 2</a>" % (self.urls[0], self.urls[1])
		self.parser.feed(body)
		self.assertEqual(self.parser.data['urls'], self.urls)

	def test_ignores_tags(self):
		# <script> and <style> contents must not appear in the parsed data.
		body = "<script>some js</script><style>some css</style><h1>hey</h1>"
		self.parser.feed(body)
		self.assertEqual(self.parser.data, { 'h1': 'hey', 'webpage_urls': [] })
示例#13
0
 def __init__(self,
              database_writer=None,
              database_reader=None,
              parser=None):
     """Store the collaborators, creating defaults when none are supplied.

     Bug fix: the original evaluated ``DatabaseWriter()``, ``DatabaseReader()``
     and ``Parser()`` in the default-argument list, so those three instances
     were created once at import time and silently shared by every caller that
     relied on the defaults. ``None`` sentinels give each call fresh objects.

     Args:
         database_writer: writer collaborator; a new DatabaseWriter when None.
         database_reader: reader collaborator; a new DatabaseReader when None.
         parser: parser collaborator; a new Parser when None.
     """
     self.database_writer = DatabaseWriter() if database_writer is None else database_writer
     self.database_reader = DatabaseReader() if database_reader is None else database_reader
     self.parser = Parser() if parser is None else parser
示例#14
0
class ArticleSpider(CrawlSpider):
    """Scrapy CrawlSpider for tuaw.com that renders pages via Selenium/Firefox."""

    name = 'article'
    allowed_domains = ['tuaw.com']
    start_urls = [
        "http://www.tuaw.com/about",
        "http://www.tuaw.com/editor/chris-rawson/page/101",
		"http://www.tuaw.com/editor/Mel-Martin/page/184"
    ]
    # Dispatch: dated article URLs, editor index pages, and everything else.
    rules = (
        Rule(LinkExtractor(allow=(r'tuaw.com/[0-9]{4}/[0-9]{1,2}/[0-9]{1,2}', )), callback='crawlArticlePage', follow=True),
        Rule(LinkExtractor(allow=(r'\/editor\/', )), callback='crawlEditorPage', follow=True),
        Rule(LinkExtractor(deny=(r'(\/editor\/)|(tuaw.com/[0-9]{4}/[0-9]{1,2}/[0-9]{1,2})', )), callback='crawlPage', follow=True)
        #Rule(LinkExtractor(deny=visited_urls))
    )

    def __init__(self):
        """Start a Firefox driver shared with the parser; 10 s page-load wait."""
        CrawlSpider.__init__(self)
        self.selenium = webdriver.Firefox()
        self.parser = Parser(self.selenium)
        self.webpageLoadTimeoutInSeconds = 10

    def parse_start_url(self, response):
        # Start URLs are treated like any other generic page.
        return self.crawlPage(response)

    def crawlPage(self, response):
        """Load the response's URL in Selenium and return a WebDriverWait on it."""
        url = response.url
        self.selenium.get(url)
        return WebDriverWait(self.selenium, self.webpageLoadTimeoutInSeconds)

    def crawlArticlePage(self, response):
        """Render an article page and yield the parsed article (generator).

        When the page has a comments section, wait for the comment widget
        (.fyre-widget) to appear before parsing.
        """
        if self.parser.containsComments():
            self.crawlPage(response).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".fyre-widget")))
        else:
            self.crawlPage(response)
        article = self.parser.parseArticle(response.url)
        yield article

    def crawlEditorPage(self, response):
        # Editor pages are only rendered; nothing is extracted from them.
        self.crawlPage(response)
示例#15
0
class ParserTestCase(unittest.TestCase):
    """Tests for Parser title extraction and result printing."""

    def setUp(self):
        """Load the HTML fixture once and pre-parse its titles."""
        self.parser = Parser()
        with open(FILE_PATH + '/test.html', 'r') as fixture:
            self.html = fixture.read()
        self.titles = self.parser.parser_titles(self.html)

    def tearDown(self):
        pass

    def test_parser_titles(self):
        """The fixture yields exactly 20 titles."""
        self.assertEqual(20, len(self.titles))

    def test_print_result(self):
        """print_result reports success for the parsed titles."""
        self.assertTrue(self.parser.print_result(self.titles))

    def test_print_result_from_parser(self):
        """print_result_from_parser reports success for the raw HTML."""
        self.assertTrue(self.parser.print_result_from_parser(self.html))
示例#16
0
class ParserTests(unittest.TestCase):
    """Tests for Parser against an empty string and a saved VDM sample page."""

    def setUp(self):
        self.__parser = Parser()

    def test_empty(self):
        # Empty input must yield zero posts.
        result = self.__parser.parse('')
        self.assertEqual(len(result), 0)

    def test_sample_page(self):
        # Parse a saved fixture and spot-check the first two posts:
        # author, timezone-aware Paris date, and full (French) content.
        with codecs.open(os.path.join(os.path.dirname(__file__), 'vdm_sample.html'), 'r', 'utf-8') as f:
            data = f.read()
        result = self.__parser.parse(data)

        self.assertEqual(len(result), 13)

        post = result[0]
        self.assertEqual(post.author, 'Jbln!')
        self.assertEqual(post.date, datetime(2016, 1, 9, 16, 33, tzinfo = tz.gettz('Europe/Paris')))
        self.assertEqual(post.content, "Aujourd'hui, je reçois une lettre d'invitation à un mariage ! Celui de ma \"meilleure\" amie qui m'a piqué mon mec lorsque que je l'ai accueillie chez moi après une de ses ruptures. Petit plus, elle me demande d'être une de ses demoiselles d'honneur. VDM")

        post = result[1]
        self.assertEqual(post.author, 'ivegotnomoney')
        self.assertEqual(post.date, datetime(2016, 1, 9, 14, 12, tzinfo = tz.gettz('Europe/Paris')))
        self.assertEqual(post.content, "Aujourd'hui, comme depuis des années maintenant, mes parents tiennent toujours à la règle de \"celui qui obtient la fève rembourse la galette des rois\". Étant en repas de famille nombreuse, il y a donc trois galettes. J'ai eu les trois fèves et je suis une étudiante avec un budget très serré. VDM")
示例#17
0
class PyDGraph:
    """Builds a dependency graph: crawl dependencies, parse them, plot them."""

    def __init__(self):
        """Create crawler/graph/parser collaborators; exit on any failure."""
        try:
            self.crawler = Crawler()
            # NOTE(review): 'pgrpah' looks like a typo for 'pgraph', but it is
            # read by generate() below, so the attribute name is preserved.
            self.pgrpah = Graph()
            self.parser = Parser()
        except Exception as e:
            print("ERROR " + str(e))
            sys.exit(-1)

    def generate(self):
        """Crawl the dependency list, parse it into nodes, and plot the graph."""
        dependency_list = self.crawler.get_dependency_list()
        nodes = self.parser.parseDependencies(dependency_list)
        self.pgrpah.plot_graph(nodes)
示例#18
0
 def fill_disallow_urls(self, url: URL):
     """Fetch robots.txt for *url*'s host and cache its Disallow rules.

     Compiled case-insensitive regexes for each disallowed path are added to
     ``self.disallow_urls``; each host is processed at most once.
     """
     parser = Parser(url)
     host = parser.host
     if host in self.seen_hosts:
         return
     self.seen_hosts.add(host)
     robots_txt_url = parser.host / 'robots.txt'
     robots_txt = requests.get(str(robots_txt_url)).text.lower()
     try:
         # Only the wildcard 'User-agent: *' section is honoured.
         index = robots_txt.index('user-agent: *')
     except ValueError:
         return
     robots_txt = robots_txt[index::]
     robots_txt = robots_txt.split('\n')
     try:
         for string in robots_txt:
             if string.startswith('disallow'):
                 # Translate robots-style '*' wildcards into regex '.+'.
                 string = string.replace('*', '.+')
                 # NOTE(review): split(':') without maxsplit truncates rules
                 # whose path itself contains ':' — confirm that is acceptable.
                 string = string.split(':')
                 # string[1] is like ' /path'; [2::] drops the space and '/'.
                 self.disallow_urls.add(
                     re.compile(fr"{host}/{string[1][2::]}", re.IGNORECASE))
     except IndexError:
         pass
示例#19
0
 def __init__(self, url):
     """Bind the crawl target URL and create its dedicated parser."""
     self.parser = Parser()
     self.url = url
示例#20
0
 def __init__(self, mode):
     """Configure the crawler for the given mode.

     Args:
         mode: selects the ORM store and link source — presumably 'app' or
             'game' (TODO confirm against _set_orm/_set_links).
     """
     self.mode = mode
     # Both helpers below read self.mode, which is set just above.
     self.store = self._set_orm()
     self.parser = Parser()
     self.links = self._set_links()
示例#21
0
class TestingParser(unittest.TestCase):
    """Tests for Parser: page titles, metadata (description/keywords), links,
    and the empty-title/description check, against one well-formed soup and
    one truncated soup."""

    def setUp(self):
        # One complete HTML fixture and one truncated one (no head/body).
        self.parser = Parser()
        self.test_soup = BeautifulSoup(
            '<!DOCTYPE html>\n<html>\n\n<head>\n <title>Cats and Dogs</title> \n<meta name="description" content="Page about cats and dogs"> \n <meta name="keywords" content="cats,dogs">\n</head><body><a href="www.dogs.com">Dogs</a><a href="www.cats.com">Cats</a></body></html>',
            'html.parser')
        self.bad_test_soup = BeautifulSoup('<!DOCTYPE html>\n<html>',
                                           'html.parser')

    def test_parser_is_instance_of_Parser(self):
        self.assertIsInstance(self.parser, Parser)

    def test_parse_webpage_content_returns_dictionary(self):
        run_parse_webpage_content = self.parser.parse_webpage_content(
            self.test_soup)
        self.assertEqual(
            run_parse_webpage_content, {
                "title": "Cats and Dogs",
                "description": "Page about cats and dogs",
                "keywords": "cats,dogs"
            })

    def test_parse_webpage_content_returns_empty_dictionary_if_values_are_empty(
            self):
        run_parse_webpage_content = self.parser.parse_webpage_content(
            self.bad_test_soup)
        self.assertEqual(run_parse_webpage_content, {})

    def test_parse_webpage_content(self):
        # parse_webpage_content must delegate title lookup exactly once.
        self.parser.find_webpage_title = MagicMock()
        self.parser.parse_webpage_content(self.test_soup)
        self.parser.find_webpage_title.assert_called_once_with(self.test_soup)

    def test_parse_webpage_content_calls_webpage_metadata_description(self):
        # The last metadata lookup made by parse_webpage_content is 'keywords'.
        self.parser.find_webpage_metadata = MagicMock(return_value="keywords")
        self.parser.parse_webpage_content(self.test_soup)
        self.parser.find_webpage_metadata.assert_called_with(
            self.test_soup, 'keywords')

    def test_find_webpage_title_return_title(self):
        run_find_webpage_title = self.parser.find_webpage_title(self.test_soup)
        self.assertEqual(run_find_webpage_title, 'Cats and Dogs')

    def test_find_webpage_title_returns_empty_string_when_no_title(self):
        run_find_webpage_title = self.parser.find_webpage_title(
            self.bad_test_soup)
        self.assertEqual(run_find_webpage_title, '')

    def test_find_webpage_metadata_returns_description(self):
        run_find_webpage_metadata = self.parser.find_webpage_metadata(
            self.test_soup, 'description')
        self.assertEqual(run_find_webpage_metadata, 'Page about cats and dogs')

    def test_find_webpage_metadata_returns_empty_string_when_no_description(
            self):
        run_find_webpage_metadata = self.parser.find_webpage_metadata(
            self.bad_test_soup, 'description')
        self.assertEqual(run_find_webpage_metadata, '')

    def test_find_webpage_metadata_returns_keywords(self):
        run_find_webpage_metadata = self.parser.find_webpage_metadata(
            self.test_soup, 'keywords')
        self.assertEqual(run_find_webpage_metadata, 'cats,dogs')

    def test_find_webpage_metadata_returns_empty_string_when_no_keywords(self):
        run_find_webpage_metadata = self.parser.find_webpage_metadata(
            self.bad_test_soup, 'keywords')
        self.assertEqual(run_find_webpage_metadata, '')

    def test_check_empty_titles_and_descriptions_returns_true(self):
        title = ''
        description = ''
        self.assertTrue(
            self.parser.check_empty_titles_and_descriptions(
                title, description))

    def test_check_empty_titles_and_descriptions_returns_false(self):
        title = "The best website ever"
        description = "This is clearly the best website, you want to visit it"
        self.assertFalse(
            self.parser.check_empty_titles_and_descriptions(
                title, description))

    def test_parse_webpages_links_returns_an_array(self):
        run_parse_webpages_links = self.parser.parse_webpages_links(
            self.test_soup)
        self.assertIn("www.dogs.com", run_parse_webpages_links)
        self.assertIn("www.cats.com", run_parse_webpages_links)
示例#22
0
 def __init__(self) -> None:
     """Create the downloader/parser pair used by the crawler."""
     self.parser = Parser()
     self.downloader = Downloader()
示例#23
0
class FarsroidDataCrawler(BaseCrawler):
    """Multithreaded crawler that scrapes app/game pages and stores parsed data."""

    def __init__(self, mode):
        # mode selects both the ORM model and the link source (see helpers).
        self.mode = mode
        self.store = self._set_orm()
        self.parser = Parser()
        self.links = self._set_links()

    def _set_orm(self):
        """Return the ORM model for the mode; implicitly None for other modes."""
        if self.mode == 'app':
            return AppData()
        if self.mode == 'game':
            return GameData()

    def _set_links(self):
        """Return the stored links for the mode; implicitly None otherwise."""
        if self.mode == 'app':
            return AppLink.load_links()
        if self.mode == 'game':
            return GameLink.load_links()

    def data_crawler(self, links, index_thread):
        """Worker loop: drain the *links* queue, parse each response, save data.

        Each branch of the outcome (saved / no data / no response) calls
        ``links.task_done()`` so queue accounting stays consistent.
        """
        while links.qsize():
            link = links.get()

            response = self.get(link)

            if response is not None:
                data = self.parser.parse(response, link, self.mode)

                if data is not None:
                    app_data = data[0]
                    count = data[1]

                    print(f'Thread: {index_thread}\t|\t'
                          f'Link: {link}\t|\t'
                          f'Count: {count}\t|\t')

                    self.store.save_app(app_data)
                    links.task_done()

                else:
                    links.task_done()
            else:
                links.task_done()

    def run_crawler(self):
        """Fan the stored links out over 20 worker threads and wait for them."""
        threads = list()

        links_queue = Queue()

        for link in self.links:
            links_queue.put(link.link)

        # Workers are numbered 1..20 for log readability.
        for t in range(1, 21):
            thread = Thread(target=self.data_crawler, args=(links_queue, t))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

        print('All Task Done...')
示例#24
0
 def setUp(self):
     # Fresh Parser per test; the double underscore name-mangles the attribute
     # to the enclosing test class.
     self.__parser = Parser()
示例#25
0
 def __init__(self):
     """Start a Firefox driver and wire it into the page parser."""
     CrawlSpider.__init__(self)
     self.webpageLoadTimeoutInSeconds = 10
     self.selenium = webdriver.Firefox()
     self.parser = Parser(self.selenium)
示例#26
0
 def test_get_url_with_urls(self):
     """Parser.get_urls finds all four links in the local fixture page."""
     with open('test.html', 'r') as fixture:
         html = fixture.read()
     parser = Parser(URL('https://t/'))
     self.assertEqual(4, len(parser.get_urls(html)))
示例#27
0
 def setUp(self):
     """Read the HTML fixture and pre-compute its parsed titles."""
     self.parser = Parser()
     with open(FILE_PATH + '/test.html', 'r') as fixture:
         self.html = fixture.read()
     self.titles = self.parser.parser_titles(self.html)
示例#28
0
	def setUp(self):
		"""Create a parser rooted at google.co.uk and the expected URL list."""
		self.parser = Parser("http://google.co.uk")
		self.urls = ["http://google.co.uk", "http://news.bbc.co.uk"]