def load(id, tm, url, html, encode, xpaths):
    """Parse a job-detail HTML page and dump the extracted fields as JSON.

    Writes ``./data/<id>.dat`` containing one JSON object with ``pub_tm``,
    ``url`` and one entry per key in *xpaths*.

    :param id:      record identifier; used as the output file name
                    (NOTE: shadows the ``id`` builtin — kept for caller compat)
    :param tm:      publication time string, stored as ``pub_tm``
    :param url:     source URL of the page
    :param html:    raw HTML to parse
    :param encode:  character encoding of *html*
    :param xpaths:  mapping of field name -> XPath expression
    """
    parser = HtmlParser(html, encode)
    parser.parse()
    # Removed: dead `db_sql` INSERT fragment and unused page_pb2.JobDescription()
    # stub — neither was referenced anywhere in this function.
    pairs = ['"pub_tm":"' + tm + '"', '"url":"' + url + '"']
    for key in xpaths:
        elements = parser.get_element_by_xpath(xpaths.get(key), encode)
        if not elements:
            print("[ERR] " + key)
            continue
        value = elements[0][2].encode('utf-8')
        # Escape backslashes and double quotes so the output stays valid JSON
        # (the original emitted raw text, which broke on quoted values).
        value = value.replace('\\', '\\\\').replace('"', '\\"')
        pairs.append('"' + key + '":"' + value + '"')
    # `with` guarantees the handle is closed even if write() raises.
    with open("./data/" + id + ".dat", 'w') as fp:
        fp.write("{" + ",".join(pairs) + "}")
def load(html, encode, xpaths):
    """Parse *html* and extract the text of each XPath in *xpaths*.

    NOTE(review): the extracted ``value`` is computed but never stored or
    returned — this looks like an unfinished stub of the fuller ``load``
    variants elsewhere in this file; confirm intent before extending.

    :param html:    raw HTML to parse
    :param encode:  character encoding of *html*
    :param xpaths:  mapping of field name -> XPath expression
    """
    parser = HtmlParser(html, encode)
    parser.parse()
    for key in xpaths:
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        # Guard against empty matches — the sibling load() variants do the
        # same check; the original raised IndexError here on a miss.
        if not elements:
            continue
        value = elements[0][2].encode('utf-8')
def _toString(self):
    """Return a Russian-language summary of current coronavirus statistics.

    Fetches the worldometers page, then formats infected / dead / recovered
    counts together with the current UTC timestamp.
    """
    htmlParser = HtmlParser('https://www.worldometers.info/coronavirus/')
    htmlParser.parse()
    timeStr = time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
    # Hoisted: the original called getContent() three times for one result.
    # Assumes getContent() is a side-effect-free getter — TODO confirm.
    stats = htmlParser.getContent()
    return ("Статистика зараженных на " + timeStr +
            "\nЗараженных: " + stats[0] +
            "\nУмерших: " + stats[1] +
            "\nВыздоровевших: " + stats[2])
def load(id, html, encode, xpaths):
    """Parse job-detail HTML and dump the extracted fields as JSON.

    Writes ``./data/<id>.dat`` containing one JSON object with one entry
    per key in *xpaths*.

    :param id:      record identifier; used as the output file name
                    (NOTE: shadows the ``id`` builtin — kept for caller compat)
    :param html:    raw HTML to parse
    :param encode:  character encoding of *html*
    :param xpaths:  mapping of field name -> XPath expression
    """
    parser = HtmlParser(html, encode)
    parser.parse()
    # Removed: unused page_pb2.JobDescription() stub and commented-out code —
    # nothing in this function referenced them.
    pairs = []
    for key in xpaths:
        elements = parser.get_element_by_xpath(xpaths.get(key), encode)
        if not elements:
            print("[ERR] " + key)
            continue
        value = elements[0][2].encode('utf-8')
        # Escape backslashes and double quotes so the output stays valid JSON
        # (the original emitted raw text, which broke on quoted values).
        value = value.replace('\\', '\\\\').replace('"', '\\"')
        pairs.append('"' + key + '":"' + value + '"')
    # `with` guarantees the handle is closed even if write() raises.
    with open("./data/" + id + ".dat", 'w') as fp:
        fp.write("{" + ",".join(pairs) + "}")
def crawl(init_url):
    """Crawl pages starting from *init_url* until no next URL is produced.

    Each iteration downloads the current page, lets the parser extract the
    content plus the next URL to visit, and writes the content out.

    :param init_url: URL the crawl starts from
    """
    # Removed: `url_pool = UrlManager()` — it was constructed but never used.
    downloader = Downloader()
    parser = HtmlParser()
    outputer = Outputer()
    temp_url = init_url
    # Loop ends when the parser returns a falsy next-URL (e.g. None).
    while temp_url:
        driver = downloader.download(temp_url)
        content, temp_url = parser.parse(driver)
        outputer.write(content)
    outputer.close()
def parse_feed(self, feed):
    """Extract, persist, and return the list of articles in *feed*.

    For each feed entry an ``Article`` is built, its page content is fetched
    and cleaned, and the article is saved before being collected.

    :param feed: a parsed feed object exposing ``entries``
    :returns: list of saved Article instances
    """
    articles = []
    htmlparser = HtmlParser()
    # Fixed: the original sliced `feed.entries[:1]` — its own comment marked
    # it as a debugging leftover that silently dropped all but the first entry.
    for e in feed.entries:
        article = Article(source=e.author, title=e.title, link=e.link)
        content = htmlparser.parse(e.link)
        # Strip a trailing " - ..." attribution suffix from the content.
        article.content = re.sub(r' -.*$', '', content)
        article.save()  # persists article and associated word frequencies
        articles.append(article)
    return articles
def load(id, html, encode, xpaths):
    """Extract fields from job-detail HTML and write them as one JSON object.

    The output file is ``./data/<id>.dat``; each key of *xpaths* whose XPath
    matches becomes one JSON member.

    :param id:      record identifier; used as the output file name
                    (NOTE: shadows the ``id`` builtin — kept for caller compat)
    :param html:    raw HTML to parse
    :param encode:  character encoding of *html*
    :param xpaths:  mapping of field name -> XPath expression
    """
    parser = HtmlParser(html, encode)
    parser.parse()
    # Removed: unused page_pb2.JobDescription() stub and commented-out code.
    members = []
    for key in xpaths:
        elements = parser.get_element_by_xpath(xpaths.get(key), encode)
        if not elements:
            print("[ERR] " + key)
            continue
        value = elements[0][2].encode('utf-8')
        # Escape backslashes and quotes: the original wrote raw text, which
        # produced invalid JSON whenever a value contained a double quote.
        value = value.replace('\\', '\\\\').replace('"', '\\"')
        members.append('"' + key + '":"' + value + '"')
    # `with` closes the handle even if write() raises (the original leaked
    # the handle on error).
    with open("./data/" + id + ".dat", 'w') as fp:
        fp.write("{" + ",".join(members) + "}")
def diff_html_from_file(cls, fileName1, fileName2, encode):
    """Return the elements that differ between two HTML files.

    Both files are parsed, their element lists are dumped to ./tmp1.txt and
    ./tmp2.txt, and the text-level diff of those dumps is returned.

    :param fileName1: path of the first HTML file
    :param fileName2: path of the second HTML file
    :param encode:    character encoding of both files
    :returns: list of differing elements ([] when a file name is empty)
    """
    if fileName1 == "" or fileName2 == "":
        print("class differ : function :diff_html_from_file() fileName1 or fileName2 is null")
        return []
    # `with open(...)` replaces the Py2-only `file(...)` and closes the
    # handles, which the original leaked.
    with open(fileName1, "rb") as f1:
        html_str1 = f1.read()
    html_Parser1 = HtmlParser(html_str1, encode)
    elements1 = html_Parser1.parse()
    html_Parser1.saveElementsToFile(elements1, "./tmp1.txt")
    with open(fileName2, "rb") as f2:
        html_str2 = f2.read()
    html_Parser2 = HtmlParser(html_str2, encode)
    elements2 = html_Parser2.parse()
    html_Parser2.saveElementsToFile(elements2, "./tmp2.txt")
    diffs = cls.diff_txt_from_file("tmp1.txt", "tmp2.txt")
    return diffs
def diff_html_from_file(cls, fileName1, fileName2, encode):
    """Return the elements that differ between two HTML files.

    Parses both files, writes their element dumps to ./tmp1.txt and
    ./tmp2.txt, then diffs those dumps.

    :param fileName1: path of the first HTML file
    :param fileName2: path of the second HTML file
    :param encode:    character encoding of both files
    :returns: list of differing elements ([] when a file name is empty)
    """
    if fileName1 == "" or fileName2 == "":
        print("class differ : function :diff_html_from_file() fileName1 or fileName2 is null")
        return []
    # `with open(...)` replaces the Py2-only `file(...)` and closes the
    # handles, which the original leaked.
    with open(fileName1, "rb") as f1:
        html_str1 = f1.read()
    html_Parser1 = HtmlParser(html_str1, encode)
    elements1 = html_Parser1.parse()
    html_Parser1.saveElementsToFile(elements1, "./tmp1.txt")
    with open(fileName2, "rb") as f2:
        html_str2 = f2.read()
    html_Parser2 = HtmlParser(html_str2, encode)
    elements2 = html_Parser2.parse()
    html_Parser2.saveElementsToFile(elements2, "./tmp2.txt")
    return cls.diff_txt_from_file("tmp1.txt", "tmp2.txt")
class Harvester():
    """Harvest articles from a CSV list of RSS feeds and store them.

    On construction, reads the feed list from *filename*, fetches every
    feed, builds Article objects from its entries and stores each one in
    the given CouchDB-style *db*.
    """

    def __init__(self, db, filename):
        'Harvest articles from the list of feeds in filename.'
        self.db = db
        self.filename = filename
        self.htmlparser = HtmlParser()
        feedlist = self.read_feed_list(filename)
        self.articles = self.parse_feedlist(feedlist)

    def read_feed_list(self, filename):
        '''
        Read the feed list from a CSV file.
        The first item of each line is the URL to an RSS feed.
        '''
        feedlist = []
        reader = csv.reader(open(filename, 'rb'))
        for line in reader:
            feedlist.append(line)
        return feedlist

    def parse_feed(self, entry):
        'Extract list of articles from the feed. Returns [] on fetch failure.'
        articles = []
        (url, publisher, publisher_location) = entry
        try:
            c = urlopen(url)
        except URLError:
            print('Failed to fetch ' + url)
            # Fixed: the original fell through here and then used the
            # unbound `c`, raising NameError on any fetch failure.
            return []
        feed = feedparser.parse(c)
        for e in feed.entries:
            # Pick up an enclosure image, if the entry carries one.
            image_link = None
            image_type = None
            for link in e.links:
                if link['rel'] == 'enclosure':
                    image_link = link['href']
                    image_type = link['type']
            article = Article(
                publisher=publisher,
                publisher_location=publisher_location,
                published_date=e.updated_parsed,
                title=e.title,
                link=e.link,
                image_link=image_link,
                image_type=image_type)
            content = self.htmlparser.parse(e.link)
            # A trailing " - Some Source" suffix (up to 7 words) is treated
            # as the article's source attribution and stripped from content.
            m = re.search(r'-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', content)
            if m:
                article.source = m.group(1)
            article.content = re.sub(
                r'(\\n)?\s*-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', '', content)
            article.store(self.db)  # put article and word frequencies into couchdb
            articles.append(article)
        return articles

    def parse_feedlist(self, feedlist):
        'Parse the RSS feeds.'
        articles = []
        for entry in feedlist:
            articles += self.parse_feed(entry)
        return articles

    def __str__(self):
        # Fixed: the original did `print self.filename`, implicitly returning
        # None — str(obj) would raise TypeError. __str__ must return a string.
        return self.filename