import time


class SpiderManager(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        # with open("content.html", 'wb') as f:
        #     f.write(content.encode('utf-8'))
        urls = self.parser.parser_url(root_url, content)
        print(urls)
        for url in urls:
            try:
                # the literal "3282" suffix is kept from the original timestamp format
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl Failed!!!', e)
        self.output.output_end()
        print('Crawl Finish!')
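# A minimal usage sketch for SpiderManager (the entry url is a hypothetical
# placeholder; HtmlDownloader, HtmlParser and DataOutput are this project's
# own helpers, assumed to be importable):
if __name__ == '__main__':
    spider = SpiderManager()
    spider.crawl('http://theater.mtime.com/China_Beijing/')  # any movie listing page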
def load(id, tm, url, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    # leftover of an abandoned DB insert path; db_sql and jd are never used below
    db_sql = "insert into job_detail(url,src_desc,type,title," \
             "keywords,department,job_require,job_duty," \
             "job_welfare,label,company,company_desc," \
             "logo,salary,work_experience," \
             "edu,field,location,head_count,pub_time) values("
    jd = page_pb2.JobDescription()
    js = "{\"pub_tm\":\"" + tm + "\","
    js = js + "\"url\":\"" + url + "\","
    for key in xpaths:
        # print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
        # set_pb(jd, key, value)
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
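# Building JSON by string concatenation breaks as soon as a value contains a
# quote or backslash. A sketch of the same write-out via the json module
# (load_safe is a hypothetical name; HtmlParser and xpaths as in load() above):
import json

def load_safe(id, tm, url, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    record = {"pub_tm": tm, "url": url}
    for key in xpaths:
        elements = parser.get_element_by_xpath(xpaths.get(key), encode)
        if elements:  # skip keys whose xpath matched nothing
            record[key] = elements[0][2]
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(json.dumps(record))  # escapes quotes and unicode correctly
    fp.close()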
def load(html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    for key in xpaths:
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if not elements:  # guard against xpaths that match nothing
            continue
        value = elements[0][2].encode('utf-8')
def _toString(self):
    htmlParser = HtmlParser('https://www.worldometers.info/coronavirus/')
    htmlParser.parse()
    timeStr = time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
    stats = htmlParser.getContent()  # [infected, deaths, recovered]
    text = ("Infection statistics as of " + timeStr +
            "\nInfected: " + stats[0] +
            "\nDeaths: " + stats[1] +
            "\nRecovered: " + stats[2])
    return text
def crawl(init_url):
    url_pool = UrlManager()
    downloader = Downloader()
    parser = HtmlParser()
    outputer = Outputer()
    temp_url = init_url
    while temp_url:  # parse() returns the next url, or a falsy value at the end
        driver = downloader.download(temp_url)
        content, temp_url = parser.parse(driver)
        outputer.write(content)
    outputer.close()
def parse_feed(self, feed):
    'Extract list of articles from the feed.'
    articles = []
    htmlparser = HtmlParser()
    for e in feed.entries[:1]:  # read just the first entry while debugging
        article = Article(source=e.author, title=e.title, link=e.link)
        content = htmlparser.parse(e.link)
        article.content = re.sub(r' -.*$', '', content)
        article.save()  # and associated word frequencies
        articles.append(article)
    return articles
def content(self):
    content = u''
    epub_reader = EpubReader(self._filename)
    epub = epub_reader.load()
    for item in epub.items:
        if isinstance(item, EpubHtml):
            html_parser = HtmlParser(html=item.get_body_content())
            content += html_parser.content() + '\n'
    return content
def load(id, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    jd = page_pb2.JobDescription()
    js = "{"
    for key in xpaths:
        # print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
        # set_pb(jd, key, value)
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
class debug_HtmlParser:
    @dec
    def __init__(self):
        self.html = '''
        <html>
        <head>
        <title>hello world</title>
        </head>
        <body>
        你好<b>世界</b>
        <h1>h1这是</h1>
        <a href="http://www.cau.edu.cn">link哈啊 1</a>
        <a href="http://www.cau.edu.cn/hello">link 2</a>
        <a href="http://www.cau.edu.cn/index">link 3</a>
        </body>
        </html>
        '''
        self.homeUrls = [
            'http://www.cau.edu.cn',
            'http://www.google.com.hk',
            'http://www.baidu.com',
        ]
        self.urlparser = UrlParser(self.homeUrls)
        self.htmlparser = HtmlParser(self.urlparser)

    @dec
    def init(self):
        self.htmlparser.init(self.html)

    @dec
    def transcode(self):
        self.htmlparser.transcode(self.html)

    @dec
    def getLinks(self):
        print self.htmlparser.getLinks()

    @dec
    def getSrcs(self):
        print self.htmlparser.getSrcs()

    def transXML(self):
        print self.htmlparser.d.text()
        strr = self.htmlparser.transXML("http://www.cau.edu.cn")
        f = open('text.txt', 'w')
        f.write(strr)
        f.close()
        print chardet.detect(strr)
        print strr
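# `dec` is used above but not defined in this snippet; a minimal stand-in that
# just logs each call, so the class runs (purely an assumption about what the
# real decorator does):
def dec(func):
    def wrapper(*args, **kwargs):
        print "[CALL] " + func.__name__
        return func(*args, **kwargs)
    return wrapper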
def diff_html_from_file(cls, fileName1, fileName2, encode):
    '''get different elements btw. two html files'''
    if fileName1 == "" or fileName2 == "":
        print "class differ : function :diff_html_from_file() fileName1 or fileName2 is null"
        return []
    html_str1 = file(fileName1, "rb").read()
    html_Parser1 = HtmlParser(html_str1, encode)
    elements1 = html_Parser1.parse()
    html_Parser1.saveElementsToFile(elements1, "./tmp1.txt")
    html_str2 = file(fileName2, "rb").read()
    html_Parser2 = HtmlParser(html_str2, encode)
    elements2 = html_Parser2.parse()
    html_Parser2.saveElementsToFile(elements2, "./tmp2.txt")
    diffs = cls.diff_txt_from_file("tmp1.txt", "tmp2.txt")
    return diffs
def iterate_folder(fp, word_to_tail_map, document_len_map, file_path):
    document_list = [document_name for document_name in listdir(file_path)
                     if isfile(join(file_path, document_name)) and document_name.isdigit()]
    word_info = {}
    for document in sorted(document_list, key=lambda x: int(x)):
        print "doing for ", document
        if getsize(file_path + document) > 3000000:  # skip files larger than ~3 MB
            continue
        signal.alarm(3)  # raise SIGALRM after three seconds
        try:
            html_parser = HtmlParser(file_path, document, True, True)
            word_list = html_parser.get_all_words()
        except Exception, e:
            log_file_to_check = open("log_file_done_tillYY", "a")
            log_file_to_check.write("Time out for %s\n" % (document))
            log_file_to_check.close()
            continue
        signal.alarm(0)  # cancel the pending alarm
        word_to_position_map = {}
        current_position = 1
        for word in word_list:
            if word not in word_to_position_map:
                word_to_position_map[word] = []
            word_to_position_map[word].append(current_position)
            current_position += 1
        document_len_map[int(document)] = current_position - 1
        for word in word_to_position_map:
            if word not in word_info:
                word_info[word] = []
            word_info[word].append((int(document), word_to_position_map[word]))
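# signal.alarm() only interrupts the parse if a SIGALRM handler that raises
# was installed beforehand. A minimal sketch of such a handler (the handler
# name is hypothetical; the original setup code is not shown in this snippet):
import signal

def _timeout_handler(signum, frame):
    raise Exception("HTML parsing timed out")

signal.signal(signal.SIGALRM, _timeout_handler)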
def main():
    # parse the url and keyword from the command line
    parser = argparse.ArgumentParser()
    parser.add_argument('url')
    parser.add_argument('keyword')
    args = parser.parse_args()
    keyword = args.keyword
    url = args.url
    # fetch the page and check it for the keyword
    response = do_request(url)
    if keyword in response:
        print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(url))
        results = process_source(response, keyword)
    # collect the linked css/js files and check each of those too
    parser = HtmlParser()  # note: rebinds `parser` from the argparse object above
    links = parser.feed(response)
    for link in links:
        response = do_request(link)
        if keyword in response:
            print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(link))
            results = process_source(response, keyword)
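# do_request and process_source are defined elsewhere in this script; a
# minimal do_request stand-in so main() runs (an assumption: it is taken to
# return the response body as text):
import requests

def do_request(url):
    # hypothetical stand-in, not the script's real implementation
    return requests.get(url, timeout=10).text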
class MySpider(object):
    def __init__(self, root_url):
        self.parser = HtmlParser()
        self.storage = DataStore()
        self._get_root_urls(root_url)

    def _get_root_urls(self, root_url):
        if not os.path.exists('job_class.json'):
            new_urls = self.parser.get_url(root_url)
            self.storage.local_store(new_urls, 'job_class.json')  # store the industry-category urls to crawl

    def joburl_init(self, pagenum, path='job_class.json'):
        root_urls = self.storage.load_data(path)
        jobs_dict = {}
        for i in pagenum:
            for name in root_urls:
                jobs_dict[name + str(i)] = root_urls[name] + str(i)  # build the page urls to crawl
        self.storage.local_store(jobs_dict, 'job_page_url.json')  # store the built urls

    def company_url(self, path='job_page_url.json'):
        company_urls = self.storage.load_data(path)
        company_dicts = {}
        url_get = 0  # number of urls fetched so far
        for company_info_url in company_urls:
            print("industry urls left to crawl:", len(company_urls) - url_get)
            url_get += 1
            url = company_urls[company_info_url]
            company_dicts.update(self.parser.getcompany_url(url))
            self.storage.local_store(url, 'job_page_url_old.json')  # record urls already crawled
        self.storage.local_store(company_dicts, 'company_info_url_new.json')  # store the company-info urls

    def company_info(self, path='company_info_url_new.json'):
        company_info_urls = self.storage.load_data(path)
        url_get = 0  # number of company-info urls fetched so far
        for company_name in company_info_urls:
            print("company-info urls left to crawl:", len(company_info_urls) - url_get)
            url_get += 1
            url = company_info_urls[company_name]
            self.parser.getcompany_info(company_name, url)
            self.storage.local_store(url, 'compang_info_url_old.json')  # record company-info urls already crawled

    # resume fetching company info from the last checkpoint
    def grab_increment(self):
        new_urls = self.storage.load_data('company_info_url_new.json')
        old_urls = self.storage.load_data('compang_info_url_old.json')
        for company_name in new_urls:
            new_url = new_urls[company_name]
            if new_url not in old_urls:
                self.parser.getcompany_info(company_name, new_url)  # was: undefined `url`
                self.storage.local_store(new_url, 'compang_info_url_old.json')  # record crawled urls
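# Hypothetical run order for the pipeline above (the root url is a placeholder;
# each step persists its results so grab_increment() can resume after an
# interruption):
spider = MySpider('http://example-job-board.com/categories')
spider.joburl_init(range(1, 11))  # build page urls for pages 1-10
spider.company_url()
spider.company_info()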
class Spiderman(object):
    def __init__(self):
        self.manage = UrlManager()
        self.parser = HtmlParser()
        self.downloader = Htmldownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manage.add_new_url(root_url)
        print(len(self.manage.new_urls))
        while self.manage.has_new_url() and self.manage.old_url_size() < 100:
            try:
                new_url = self.manage.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manage.add_new_urls(new_urls)
                self.output.store_data(data=data)
                print('Crawled %s links so far' % self.manage.old_url_size())
            except Exception:
                print('crawl Failed')
        self.output.output_html()
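# A short usage sketch (hypothetical root url; the loop above stops after 100
# crawled links or when no new urls remain):
spider = Spiderman()
spider.crawl('http://example.com/start-page')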
class Harvester():
    def __init__(self, db, filename):
        'Harvest articles from the list of feeds in filename.'
        self.db = db
        self.filename = filename
        self.htmlparser = HtmlParser()
        feedlist = self.read_feed_list(filename)
        self.articles = self.parse_feedlist(feedlist)

    def read_feed_list(self, filename):
        '''
        Read the feed list from a CSV file.
        The first item of each line is the URL to an RSS feed.
        '''
        feedlist = []
        reader = csv.reader(open(filename, 'rb'))
        for line in reader:
            feedlist.append(line)
        return feedlist

    def parse_feed(self, entry):
        'Extract list of articles from the feed.'
        articles = []
        (url, publisher, publisher_location) = entry
        try:
            c = urlopen(url)
        except URLError:
            print 'Failed to fetch ' + url
            return articles  # nothing to parse if the fetch failed
        feed = feedparser.parse(c)
        # for e in feed.entries[:1]:  # read just the first entry while debugging
        for e in feed.entries:
            image_link = None
            image_type = None
            for link in e.links:
                if link['rel'] == 'enclosure':
                    image_link = link['href']
                    image_type = link['type']
            article = Article(
                publisher=publisher,
                publisher_location=publisher_location,
                published_date=e.updated_parsed,
                title=e.title,
                link=e.link,
                image_link=image_link,
                image_type=image_type)
            content = self.htmlparser.parse(e.link)
            # strip a trailing "- Source Name" attribution, keeping it as the source
            m = re.search(r'-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', content)
            if m:
                article.source = m.group(1)
            article.content = re.sub(r'(\\n)?\s*-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', '', content)
            article.store(self.db)  # put article and word frequencies into couchdb
            articles.append(article)
        return articles

    def parse_feedlist(self, feedlist):
        'Parse the RSS feeds.'
        articles = []
        for entry in feedlist:
            articles += self.parse_feed(entry)
        return articles

    def __str__(self):
        return self.filename  # __str__ must return, not print
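# parse_feed() unpacks each CSV row as (feed_url, publisher, publisher_location),
# so the feed list file is expected to look like (values are made up):
#   http://example.com/world/rss,Example News,Example City
# A hypothetical instantiation, given a couchdb handle `db`:
# harvester = Harvester(db, 'feeds.csv')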
class Spider:
    def __init__(self, url, number_of_threads=20, allowed_urls=[],
                 blocked_urls=[], basic_auth=(), depth=-1):
        self.url = url
        self.number_of_threads = number_of_threads
        self.allowed_urls = allowed_urls
        # self.blocked_urls = blocked_urls
        self.lost_url = set()
        self.basic_auth = basic_auth
        self.depth = depth
        self.crawl = True
        self.visited = {}
        self.general_visited = set()
        self.unvisited = set()
        self.general_unvisited = {self.url}
        self.fetched_url_record = dict()
        self.csv_table = CsvFormat([
            "url", "status code", "title", "keyword", "description",
            "h1", "h2", "h3", "h4", "h5", "h6", "index", "open tags",
            "external links", "h_tag_format"
        ])
        self.downloaded_pages = {}
        self.record = []
        self.url_parser = UrlParser(url)
        self.parser = HtmlParser()
        self.filemanager = FileManager()

    def start(self):
        self.fetch_html()
        while len(self.general_visited) < len(self.general_unvisited) and self.crawl:
            self.fetch_html()

    def fetch_html(self):
        url = self.get_url()
        if url in self.general_visited or not url:
            return
        res = self.get_html(url)
        if res is None:  # the request failed; get_html() already logged it
            self.add_to_visited(url, 0)  # mark it visited so the loop can terminate
            return False
        if res.status_code >= 500:
            self.add_to_visited(url, 500)
            return False
        elif res.status_code >= 400:
            self.save_formated_data(res, url)
            self.add_to_visited(url, 400)
        elif res.status_code >= 300:
            if res.history:
                if self.url_parser.domain not in res.url:
                    return False
        elif res.status_code >= 200:
            self.save_formated_data(res, url)
            self.add_to_visited(url, 200)

    def save_formated_data(self, response, current_url):
        html = BeautifulSoup(response.content, "lxml")
        self.csv_table.create_row('data')
        h_tags = self.parser.get_all_h(html)
        update = {
            "url": current_url,
            "status code": response.status_code,
            "title": self.parser.get_title(html),
            "keyword": self.parser.get_meta_keyword(html),
            "description": self.parser.get_meta_description(html),
            "h1": self.parser.get_htag("h1", h_tags),
            "h2": self.parser.get_htag("h2", h_tags),
            "h3": self.parser.get_htag("h3", h_tags),
            "h4": self.parser.get_htag("h4", h_tags),
            "h5": self.parser.get_htag("h5", h_tags),
            "h6": self.parser.get_htag("h6", h_tags),
            "index": self.parser.get_meta_index(html),
            "open tags": self.find_open_tags(response.text),
            "external links": self.parser.get_broken_a_tags(
                response.text, self.url_parser.domain, current_url),
            "h_tag_format": self.parser.tag_structure(response.text),
        }
        if response.status_code >= 400:
            # note which pages linked to the broken url
            update["status code"] = str(update["status code"])
            for fetched_page_url, fetched_url_list in self.fetched_url_record.items():
                if current_url in fetched_url_list:
                    update["status code"] += f" found on {fetched_page_url},\n"
        self.csv_table.update_row('data', update)
        self.csv_table.add_row_to_table('data')
        fetched_urls = self.parser.get_url(html, self.url_parser.domain, current_url)
        self.add_to_unvisited(current_url, fetched_urls)

    def get_url(self):
        if not self.unvisited:
            self.unvisited = self.general_unvisited - self.general_visited
        return self.unvisited.pop()

    def add_to_visited(self, key, *args):
        if key not in self.visited and args:
            self.visited[key] = list(args)
        self.general_visited.add(key)

    def add_to_unvisited(self, url, fetched_urls):
        self.fetched_url_record[url] = fetched_urls
        self.general_unvisited.update(fetched_urls)

    def find_open_tags(self, html):
        open_tag_finder = OpenTagFinder()
        open_tag_finder.feed(html)
        open_tag_finder.reset()
        open_tags = open_tag_finder.get_open_tags()
        return open_tags

    def get_html(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95'
        }
        try:
            if self.basic_auth:
                return requests.get(url, headers=headers,
                                    auth=HTTPBasicAuth(self.basic_auth[0], self.basic_auth[1]),
                                    timeout=5.0)
            else:
                return requests.get(url, headers=headers, timeout=80.0)
        except requests.exceptions.RequestException as e:
            print(e)
            self.filemanager.save_to_log(f"{e} in url {url}")
            return None
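# Minimal usage sketch (credentials and url are placeholders; CsvFormat,
# UrlParser, HtmlParser, FileManager and OpenTagFinder are this project's
# own helpers):
spider = Spider("https://example.com/", number_of_threads=4,
                basic_auth=("user", "secret"))
spider.start()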
import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser
from urlparser import UrlParser
from time import sleep
import codecs
import json
import pandas as pd

visited = set()
unvisited = set()
domain = 'www.motoji.co.jp'
siteUrl = f"https://{domain}/"
parser_url = UrlParser(siteUrl)
parser_html = HtmlParser()
DATA = []


def get_res(url):
    headers_pc = {'User-Agent': 'robot wpmake'}
    try:
        res = requests.get(url, headers=headers_pc, timeout=5.0, allow_redirects=False)
        return res
    except requests.exceptions.RequestException as e:
        print(e)
        return False


def update_data(url, status_code):
    DATA.append({"url": url, "status_code": status_code})
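# A sketch of how these helpers might drive a simple status-code audit
# (the audit function is hypothetical; only get_res, update_data and the
# module-level sets above are assumed):
def audit(start_url):
    unvisited.add(start_url)
    while unvisited:
        url = unvisited.pop()
        if url in visited:
            continue
        visited.add(url)
        res = get_res(url)
        update_data(url, res.status_code if res else "request failed")
        sleep(1)  # be polite to the server
    pd.DataFrame(DATA).to_csv("status_report.csv", index=False)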