class SpiderMain(object):
    """Top-level movie crawler.

    Downloads a listing page, extracts movie ids, fetches the rating JSON
    for the first few ids from the mtime service API, and stores the
    parsed items through the output component.
    """

    # Query template for the rating service. Built from adjacent string
    # literals so the URL contains no whitespace — this replaces the old
    # triple-quoted/backslash string that needed a .replace(' ', '') hack.
    _RATING_URL = (
        'http://service.library.mtime.com/Movie.api'
        '?Ajax_CallBack=true'
        '&Ajax_CallBackType=Mtime.Library.Services'
        '&Ajax_CallBackMethod=GetMovieOverviewRating'
        '&Ajax_CrossDomain=1'
        '&Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F{0}%2F'
        '&t={1}'
        '&Ajax_CallBackArgument0={2}'
    )

    def __init__(self):
        # Collaborators are project-level components (declared elsewhere).
        self.manager = URLManager()
        self.down = Download()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl *root_url* and persist rating data for the leading movie ids.

        :param root_url: listing page whose HTML contains the movie ids.

        Side effects: network downloads, parser state mutation, and a
        store/close cycle on the output component.
        """
        content = self.down.download(root_url)
        movie_ids = self.parser.parse_urls(content)
        for count, mid in enumerate(movie_ids):
            # NOTE: `> 10` lets indices 0..10 through, i.e. 11 movies are
            # processed — original behavior kept; confirm whether 10 was meant.
            if count > 10:
                break
            timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
            movie_link = self._RATING_URL.format(mid, timestamp, mid)
            res = self.down.download(movie_link)
            self.parser.parser_json(res)
        self.output.store_data(self.parser.items)
        self.output.close_connect()
def crawl(self):
    """Breadth-first crawl driven by ``self.queue``.

    Each queue entry is ``(url, attempt, depth)``. Every dequeued URL is
    classified (outer link / subdomain / inner link); pages within the
    depth limit are fetched, failed fetches are retried up to
    ``self.max_attempts``, and newly discovered links are enqueued once.
    Summary statistics are printed when the queue drains.
    """
    while not self.queue.empty():
        url, attempt, depth = self.queue.get()
        print((url, attempt, depth))

        # Classify the URL before deciding whether to expand it.
        if self.is_outer_url(url):
            self.outer_link_counter += 1
        elif self.is_subdomain_url(url):
            self.subdomain_set.add(self.get_subdomain_name(url))
        else:
            self.inner_link_counter += 1

        # Depth limit: count the page but do not expand its links.
        if depth >= self.max_depth:
            continue

        content = self.get_page(url)
        if not content:
            # Fetch failed: retry with an incremented attempt counter,
            # or give up once the attempt budget is exhausted.
            if attempt >= self.max_attempts:
                self.error_counter += 1
            else:
                self.queue.put((url, attempt + 1, depth))
            continue

        # Enqueue every unseen, non-empty link one level deeper.
        for link in HtmlParser(content).get_links():
            if not link:
                continue
            full = self.make_full_link(link)
            if full not in self.reached_urls:
                self.reached_urls[full] = depth + 1
                self.queue.put((full, 0, depth + 1))

    print(self.reached_urls)
    print(self.queue.qsize())
    print("Subdomains:", self.subdomain_set)
    print("Inner links count:", self.inner_link_counter)
    print("Outer links count:", self.outer_link_counter)
    print("Unavailable pages count:", self.error_counter)
class ArticleReader:
    """Read article content using suitable parsers."""

    def __init__(self, **kwargs):
        # Keep the raw keyword config and build both parsers from it.
        self.config = kwargs
        self.wtparser = WikitextParser(**kwargs)
        self.hparser = HtmlParser(**kwargs)

    def get_seed_links(self, text, targets=None):
        """Delegate seed-link extraction to the wikitext parser."""
        return self.wtparser.get_seed_links(text, targets)

    def get_links(self, title, text, html):
        """Return ``(links, transcludes)`` for an article.

        Transcludes are gathered from both the wikitext and the HTML
        only when enabled in the config; otherwise an empty set.
        """
        links = self.wtparser.get_links(title, text)
        if not self.config['transcludes']['enabled']:
            return links, set()
        transcludes = (self.wtparser.get_transcludes(text)
                       | self.hparser.get_transcludes(html))
        return links, transcludes
def test_img_alt(self):
    """Alt text of an <img> element appears in the extracted text."""
    markup = "<img alt='some text'></img>"
    self.assertEqual(HtmlParser(markup).get_text(), "some text")
def test_empty_page(self):
    """An empty document yields no links and empty text."""
    empty = HtmlParser("")
    self.assertEqual(empty.get_links(), [])
    self.assertEqual(empty.get_text(), '')
def test_script(self):
    """Script bodies must not leak into the extracted text."""
    markup = "<script type='text/javascript'>var a = 1</script>"
    self.assertEqual(HtmlParser(markup).get_text(), '')
def test_numeric_character_reference(self):
    """Numeric character references should be decoded by get_text().

    NOTE(review): both literals below look encoding-garbled (mojibake) —
    the input was presumably ``&#...;`` references originally. Fixture
    preserved byte-for-byte; verify against the original file.
    """
    markup = "ΣΣΣΣΣ"
    self.assertEqual(HtmlParser(markup).get_text(), "¦²¦²¦²¦²¦²")
def test_character_entity_reference(self):
    """Character entity references should be decoded by get_text().

    NOTE(review): input and expectation are identical literals here —
    the input was likely ``&amp;&lt;&gt;`` before an encoding pass lost
    the entity markup. Fixture preserved byte-for-byte; verify.
    """
    self.assertEqual(HtmlParser("&<>").get_text(), "&<>")
def test_meta_wrong(self):
    """get_meta() still returns the content attribute for a bogus http-equiv."""
    markup = "<meta content='text/html; charset=utf-8' http-equiv='123'>"
    self.assertEqual(HtmlParser(markup).get_meta(),
                     ["text/html; charset=utf-8"])
def test_new_line_test(self):
    """Text from separate block elements is joined with newlines."""
    markup = "<div><p>some text</p></div><div><a>another text</a></div>"
    self.assertEqual(HtmlParser(markup).get_text(), "some text\nanother text")
def test_get_several_links(self):
    """All hrefs are collected in document order, tag case notwithstanding."""
    markup = ("<div><a href='http://abc.abc/'>text</a></div>"
              "<a href='http://def.def/'></a>"
              "<div><A HREF='q.html'>text</a></div>")
    expected = ['http://abc.abc/', 'http://def.def/', 'q.html']
    self.assertEqual(HtmlParser(markup).get_links(), expected)
def test_one_link(self):
    """A single anchor's href is returned as a one-element list."""
    self.assertEqual(
        HtmlParser("<a href='http://abc.abc/'>text</a>").get_links(),
        ['http://abc.abc/'],
    )
def test_text_no_tags(self):
    """Tag-free input passes through get_text() unchanged."""
    self.assertEqual(HtmlParser("some text").get_text(), "some text")
from common import Download
from parsers import HtmlParser

# Ad-hoc smoke script: fetch the Beijing theater listing page and print
# whatever the (private) _parse_movies helper extracts from it.
downloader = Download()
html_parser = HtmlParser()
page = downloader.download('http://theater.mtime.com/China_Beijing/')
movies = html_parser._parse_movies(page)
print(movies)
def __init__(self, **kwargs):
    """Keep the raw keyword config and construct both parsers from it.

    Construction order (wikitext first, then HTML) is preserved in case
    the parser constructors have side effects.
    """
    self.config = kwargs
    self.wtparser = WikitextParser(**kwargs)
    self.hparser = HtmlParser(**kwargs)
def __init__(self):
    """Wire up the crawler collaborators.

    URL manager, downloader, HTML parser, and data-output component are
    all project classes declared elsewhere in the package.
    """
    self.manager = URLManager()
    self.down = Download()
    self.parser = HtmlParser()
    self.output = DataOutput()