def checkerFunction(myInput):
    # Assumes module-level imports: datetime, os, hashlib, urllib, requests,
    # and readability's Document.
    today = datetime.date.today()
    try:
        # Use Google's "I'm Feeling Lucky" redirect to locate the privacy policy page.
        google1 = 'http://www.google.com/search?hl=en&q='
        google2 = '%20privacy%20policy&btnI=1'
        keyword = myInput
        url = google1 + keyword + google2
        r = requests.get(url, allow_redirects=False)
        url = r.headers['location']
    except Exception as e:
        return
    myFullPath = "./sandbox/db/" + keyword
    if not os.path.exists("./sandbox"):
        os.makedirs("./sandbox")
    if not os.path.exists("./sandbox/db/"):
        os.makedirs("./sandbox/db/")
    if not os.path.exists(myFullPath):
        os.makedirs(myFullPath)
    filename = keyword + "." + str(today)
    filetowrite = myFullPath + "/" + filename
    fileExist = os.path.isfile(filetowrite)
    if url is None:
        return
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary()
    tempFileMade = False
    originalFileMade = False
    if fileExist:
        # A copy for today already exists, so write a temporary file instead.
        filetowrite = filetowrite + ".tmp."
        f = open(filetowrite, 'w')
        writeThis = str(readable_article.encode('ascii', 'ignore'))
        f.write(writeThis)
        f.close()
        tempFileMade = True
    else:
        f = open(filetowrite, 'w')
        writeThis = str(readable_article.encode('ascii', 'ignore'))
        f.write(writeThis)
        f.close()
        originalFileMade = True
    hashedmd5 = hashlib.md5(readable_article.encode('ascii', 'ignore'))
    hashedArticle = hashedmd5.hexdigest()
    return hashedArticle
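A minimal usage sketch (not part of the original source): checkerFunction returns the MD5 hex digest of the readable policy text, or None on failure, so one way to use it is to compare today's hash with a hash saved by an earlier run. The keyword "example" and the previous_hash value below are placeholders.

# Hypothetical driver: hash today's policy text and compare it with a stored hash.
previous_hash = "d41d8cd98f00b204e9800998ecf8427e"  # placeholder from an earlier run
current_hash = checkerFunction("example")
if current_hash is not None and current_hash != previous_hash:
    print "privacy policy text has changed since the last run"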
def get_announcement_body(url):
    # Assumes a mechanize-style browser `br` plus datetime, urlparse,
    # BeautifulSoup and readability's Document at module level.
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()
    readable_announcement = Document(html).summary()
    readable_title = Document(html).title()
    soup = BeautifulSoup(readable_announcement, "lxml")
    final_announcement = soup.text
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        images.append(li)
    resp[0] = str(final_announcement.encode("ascii", "ignore"))
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = str(now.month) + " " + str(now.day) + " " + str(now.year) + "-" + \
        str(now.hour) + ":" + str(now.minute) + ":" + str(now.second)
    resp[3] = url
    resp[4] = url
    resp[5] = ""
    #insertDB(resp)
    #print "inserted resp"
    title_article = []
    title_article.append(final_announcement)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
def extrat_html_document(url):
    # Python 2 snippet; relies on module-level urllib2, BeautifulSoup,
    # readability's Document, the lists block_url / exception_url, and a log file _file.
    try:
        print "extrat_html_document"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = urllib2.Request(url, headers=headers)
        socket = urllib2.urlopen(r, timeout=1)
        url = socket.geturl()
        html = socket.read()
        # Blocked URLs are skipped outright.
        for bl_url in block_url:
            if len(url.split(bl_url)) > 1:
                summary = "block"
                return summary
        # For exception URLs only the title is kept.
        for ext_url in exception_url:
            if len(url.split(ext_url)) > 1:
                readable_title = Document(html).short_title()
                summary = readable_title.encode('utf-8')
                _file.write(summary + '\n')
                return summary
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        summary = readable_title.encode('utf-8') + readable_title.encode('utf-8')
        print "soup start"
        soup = BeautifulSoup(readable_article.replace("br/", "p"), "html.parser")
        print "summary:"
        for s in soup("p"):
            summary += str(s.encode('utf-8'))
        # summary += readable_article.encode('utf-8')
    except Exception:
        _file.write('extrat_html_document Failed URL : ' + url + '\n')
        summary = "Failed Get data"
    return summary
def get_content(self, url):
    # Splits the readable article into a list of [type, value] pairs:
    # '0' entries are image URLs, '1' entries are text paragraphs.
    rt_result = []
    dr = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    cur_title = Document(html).short_title().replace(' ', '')
    readable_article = Document(html).summary()
    print readable_article.encode('utf8')
    readable_article = readable_article.replace(' ', '')
    cur_list = readable_article.replace('</p>', '\n').split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            #print item.split('src=')[1].split('"')[1]
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    rt_result.append(['0', img.get('src')])
        else:
            use_item = dr.sub('', item).replace(' ', '')
            if len(use_item) > 10:
                rt_result.append(['1', use_item])
    return cur_title, rt_result
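A hedged driver sketch for the method above: `spider` stands in for whatever class actually defines get_content(), and the URL is a placeholder; it just separates the mixed result list back into text and images.

# Hypothetical usage of get_content(); the instance name and URL are illustrative.
title, parts = spider.get_content('http://example.com/article.html')
texts = [value for kind, value in parts if kind == '1']
image_urls = [value for kind, value in parts if kind == '0']
print title
print '%d paragraphs, %d images' % (len(texts), len(image_urls))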
def download_html_as_text(url, filename=None, format_to='rst'):
    """Download HTML content from url and convert it to plain text."""
    # Construct internet connection
    headers = {'User-Agent': 'Mozilla Firefox for Ubuntu canonical - 1.0'}
    req = urllib2.Request(url, headers=headers)
    con = urllib2.urlopen(req)
    html = con.read()
    # Fetch and convert main contents
    article = Document(html).summary()
    if len(article) < 1024:
        article = html
    article = patch_image_alt(article)
    title = Document(html).short_title()
    text = pypandoc.convert(article, format_to, format='html')
    title_utf8 = title.encode('utf-8')
    lines_insert = [u'\n\n', u'=' * len(title_utf8), u'\n', title_utf8, u'\n',
                    u'=' * len(title_utf8), u'\n\n', u':URL: ' + url, u'\n\n']
    title = title.split('|,-')[0]
    # Search for urls of images
    imgurl_pattern = '\.\.\s+\|([^|]+)\|\s+image::\s+(https?://\S+)'
    imgurl_re = re.compile(imgurl_pattern, re.I)
    image_urls = imgurl_re.findall(text)
    if filename is None:
        filename = title.split('-')[0].strip().replace(' ', '-')
    txtfile = open(filename + '-bak.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
    # Replace online image URLs with local paths.
    images = download_images(image_urls, filename + '-images')
    for img, link in images:
        text = text.replace(link, img)
    txtfile = open(filename + '.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
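A one-line call sketch, assuming the module-level helpers patch_image_alt and download_images referenced above exist; the URL and filename are placeholders.

# Illustrative call: converts the post to reStructuredText, writing
# 'my-post-bak.rst' and 'my-post.rst' and downloading images into 'my-post-images'.
download_html_as_text('http://example.com/some-post.html', filename='my-post')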
def extract(self, doc):
    from readability.readability import Document
    from bs4 import BeautifulSoup
    try:
        if 'html' in doc:
            html = doc['html']
            readable = Document(html, recallPriority=self.recall_priority).summary(
                html_partial=self.html_partial)
            cleantext = BeautifulSoup(readable.encode('utf-8'), 'lxml').strings
            readability_text = ' '.join(cleantext)
            return readability_text
        else:
            return ''
    except Exception as e:
        print 'Error in extracting readability %s' % e
        return ''
def extract(self, html_content, options=None):
    if options:
        self.__parse_options(options)
    from readability.readability import Document
    from bs4 import BeautifulSoup
    try:
        if html_content:
            readable = Document(html_content, recallPriority=self.recall_priority).summary(
                html_partial=self.html_partial)
            cleantext = BeautifulSoup(readable.encode('utf-8'), 'lxml').strings
            readability_text = ' '.join(cleantext)
            return {'text': readability_text}
        else:
            return None
    except Exception as e:
        print 'Error in extracting readability %s' % e
        return None
def getReadableArticle(url):
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()
    readable_article = Document(html).summary()
    #print readable_article
    readable_title = Document(html).title()
    #print readable_title
    soup = BeautifulSoup(readable_article)
    final_article = soup.text
    #print final_article
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        #print li
        images.append(li)
    resp[0] = str(final_article.encode("ascii", "ignore"))
    #print resp[0]
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = str(now.month) + " " + str(now.day) + " " + str(now.year) + "-" + \
        str(now.hour) + ":" + str(now.minute) + ":" + str(now.second)
    resp[3] = url
    resp[4] = url
    #if len(images) > 0:
    #    resp[5] = images[0]
    #else:
    resp[5] = ""
    insertDB(resp)
    print "inserted resp"
    title_article = []
    title_article.append(final_article)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
def startanalyse(region, company, keyword, count):
    print '\nModule 3 - analyse html pages to judge keyword related or not.'
    searchkey = '%s+%s' % (company, keyword)
    # file for saving analyzing results
    txtfilename = region + os.sep + company + os.sep + '%s_result.txt' % searchkey
    txtfile = open(txtfilename, 'r')
    txtcont = txtfile.readlines()
    txtfile.close()
    # meta html page file name
    _htmlfilename = region + os.sep + company + os.sep + searchkey + '_%d.html'
    yes = 0
    no = 0
    # patterns: title, keywords, description (the closing ">" is not always preceded the same way)
    pattern_title = '<title>(.*?)</title>'
    pattern_key = '<meta\s(name=["]?keywords["]?\scontent=\"(.*?)\"|content=\"(.*?)\"\sname=["]?keywords["]?).*?>'
    pattern_des = '<meta\s(name=["]?description["]?\scontent=\"(.*?)\"|content=\"(.*?)\"\sname=["]?description["]?).*?>'
    txtlist = []
    tmpfilename = 'tmp.txt'  # temp usage
    for i in range(count):
        tmp = i + 1
        htmlfilename = _htmlfilename % tmp
        company_flag = False
        keyword_flag = False
        # skip html files that are empty
        file_size = os.stat(htmlfilename).st_size
        if file_size != 0:
            htmlfile = open(htmlfilename, 'r')
            htmlcontent = htmlfile.read()
            htmlfile.close()
            # 1 - head content: description, keywords, title
            head_title = re.search(pattern_title, htmlcontent, re.I | re.S)
            head_key = re.search(pattern_key, htmlcontent, re.I | re.S)
            head_des = re.search(pattern_des, htmlcontent, re.I | re.S)
            # 2 - body content: readability
            body_content = Document(htmlcontent).summary()
            tmpfile = open(tmpfilename, 'w')
            tmpfile.write(body_content.encode('utf-8'))
            tmpfile.close()
            tmpfile = open(tmpfilename, 'r')
            body_content = tmpfile.read()
            tmpfile.close()
            # is the page company related?
            if (head_title is not None and company in head_title.group(1)) or \
               (head_key is not None and company in head_key.group(1)) or \
               (head_des is not None and company in head_des.group(1)):
                company_flag = True
            else:
                _company = unicode(company, 'mbcs')
                if _company in body_content:
                    company_flag = True
            # if not company related, stop judging
            if company_flag:
                # is the page keyword related?
                if (head_title is not None and keyword in head_title.group(1)) or \
                   (head_key is not None and keyword in head_key.group(1)) or \
                   (head_des is not None and keyword in head_des.group(1)):
                    keyword_flag = True
                else:
                    _keyword = unicode(keyword, 'mbcs')
                    if _keyword in body_content:
                        keyword_flag = True
        # show results
        print tmp, ' company related:', company_flag, ' keyword related:', keyword_flag
        # store results
        if company_flag and keyword_flag:
            txtlist.append('yes')
        else:
            txtlist.append('no')
    # write back to the analyzing result file
    for j in range(len(txtcont)):
        newcont = '*' + txtlist[j] + '\n'
        oldcont = txtcont[j]
        txtcont[j] = oldcont.replace('\n', newcont)
    txtfile = open(txtfilename, 'w')
    txtfile.writelines(txtcont)
    txtfile.close()
    if os.path.exists(tmpfilename):
        os.remove(tmpfilename)
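A hedged call sketch for the analysis step above: it assumes the earlier crawl modules have already written `<region>/<company>/<company>+<keyword>_result.txt` and the numbered `*_N.html` pages; the argument values are placeholders.

# Illustrative invocation with placeholder arguments; the region/company
# directory layout must already exist from the earlier modules.
startanalyse('guangdong', 'SomeCompany', 'pollution', 10)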
import os, re
import requests
import pdfkit
import json
from bs4 import BeautifulSoup, NavigableString, Tag
from apiclient.discovery import build
from readability.readability import Document
import urllib

blogURL = 'http://katelynnow.com/riding-solo/'
r = requests.get(blogURL)
html = r.text
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
with open('test.html', 'wb') as f:
    f.write(readable_article.encode('utf8'))
pdfkit.from_string(readable_title + readable_article, 'out.pdf')

os.chdir('/Users/mrswhitneybell/Documents/Jason/J4')


def blog2pdf(blogURL):
    service = build('blogger', 'v3', developerKey='AIzaSyAMtRVlEQPjdxvESWqjocPE42D9s1eFlRM')
    blogs = service.blogs()
    request = blogs.getByUrl(url=blogURL, view='READER')
    blogInfo = request.execute()
    blogId = blogInfo['id']
    posts = service.posts()
    request = posts.list(blogId=blogId, status='live', orderBy='published',
                         fetchImages=True, view='READER')
def parse(self, response):
    sel = Selector(response)
    item = PostItem()
    # fill page url
    item['url'] = response.url

    # extract page title
    def match_title(title):
        if title is None:
            return False
        for keyword in self.title_keywords:
            regex = re.compile(".*%s.*" % keyword)
            if not regex.match(title):
                return False
        return True

    for tag in ("h1", "h2", "h3", "h4", "title", "strong", "b", "p", "span"):
        for heads in sel.xpath("//%s/text()" % tag).extract():
            #for head in heads.strip().encode('utf-8').split(" - "):
            for head in filter(None, self.head_seps.split(heads.encode('utf-8'))):
                if match_title(head):
                    item['title'] = head.strip()
                    break

    # clean page content with readability
    html = sel.xpath("//html").extract()
    if html:
        content = Document(html[0]).summary()
        item['page_content'] = content.encode('utf-8')
        #print item['page_content']
    if item.get('title') is None:
        print "title not found in this page"
        return
    if item.get('page_content') is None:
        print "content not found in this page"
        return
    #text = HtmlTool.text(html[0]).encode('utf-8')
    text = HtmlTool.text(content).encode('utf-8')
    lines = filter(None, self.line_seps.split(text))

    # try to extract project name from title
    res = self.project_name_exp.match(item['title'])
    if res:
        item["project_name"] = res.groups()[1]
    # project pollutions
    item["pollutions"] = {}
    # extract other fields from page content
    post_lapse_time = None
    self.hinting_results = {}
    # dates occurring in page content
    self.dates = []

    for line in lines:
        def extract_field(field):
            exps = self.field_regexps[field].get("extract", [])
            for exp in exps:
                result = exp[0].match(line)
                if result:
                    try:
                        return result.groups()[exp[1]]
                    except:
                        pass

        def hintextract_field(field):
            if field in self.hintings:
                exps = self.field_regexps[field].get("hintextract", [])
                for exp in exps:
                    result = exp[0].match(line)
                    if result:
                        try:
                            return result.groups()[exp[1]]
                        except:
                            pass

        def set_field(field):
            def set_extract_field(field):
                extract_res = extract_field(field)
                if extract_res:
                    item[field] = extract_res
                    self.hinting_results[field] = False
                    return True

            def set_hintextract_field(field):
                hintextract_res = hintextract_field(field)
                if hintextract_res:
                    item[field] = hintextract_res
                    self.hinting_results[field] = True
                    return True

            if not item.get(field):
                if set_extract_field(field):
                    return True
                else:
                    return set_hintextract_field(field)
            elif self.hinting_results.get(field):
                set_extract_field(field)
                return True
            else:
                return True

        def append_field(field):
            exps = self.field_regexps[field].get("appending", [])
            for exp in exps:
                if exp[0].match(line):
                    item[field][exp[1].encode('utf-8')] = 1

        def hinting_fields(fields):
            for field in fields:
                exps = self.field_regexps[field].get("hinting", [])
                for exp in exps:
                    if exp.match(line):
                        self.hintings = {}
                        for field in fields:
                            self.hintings[field] = True
                        break

        # set hinting field
        hinting_fields(["project_address"])
        hinting_fields(["builder_name", "builder_address"])
        hinting_fields(["eia_name", "eia_address"])
        # set item fields
        if set_field("project_name"):
            set_field("project_address")
            set_field("project_investment")
        if set_field("builder_name"):
            set_field("builder_address")
        #print self.hintings
        if set_field("eia_name"):
            set_field("eia_address")
        if set_field("start_date"):
            res = self.date_regexp.match(line)
            if res:
                digits = res.groups()
                try:
                    self.dates.append(WorkDay(Digit(digits[0]), Digit(digits[2]), Digit(digits[4])))
                except:
                    pass
        # append fields
        append_field("pollutions")
        # extract fields
        if post_lapse_time is None:
            lapse = extract_field("lapse_time")
            if lapse:
                post_lapse_time = Digit(lapse)

    # sort all dates occurring in content
    # and the first date must be start date of this project
    # if there is no explicit lapse time we can use the last date as end date of this project
    self.dates = sorted(self.dates)
    if self.dates:
        item["post_start_date"] = str(self.dates[0] + 0)
    if self.dates and post_lapse_time:
        item["post_end_date"] = str(self.dates[0].within(post_lapse_time))
    elif len(self.dates) > 1:
        item["post_end_date"] = str(self.dates[-1] + 0)
    if item.get("project_investment"):
        investment = Digit(item["project_investment"])
        item["project_investment"] = investment if investment > 0 else None
    # print a summary of the extracted fields (the Chinese labels are: page title,
    # page link, project name/address/investment, pollution types, builder name and
    # address, EIA agency name and address, announcement dates)
    print "网页标题:", item.get('title')
    print "网页链接:", item.get('url')
    print "项目名称:", item.get("project_name")
    print "项目地址:", item.get("project_address")
    print "项目投资:", item.get("project_investment")
    print "污染类型:", "、".join(item.get("pollutions").keys())
    print "建设单位:", item.get("builder_name")
    print "单位地址:", item.get("builder_address")
    print "环评单位:", item.get("eia_name")
    print "单位地址:", item.get("eia_address")
    print "公告时间:", item.get("post_start_date"), "-", item.get("post_end_date")
    item["page_content"] = None
    yield item
# Here is my code.
# Imports.
from readability.readability import Document
import urllib2

# Get the user's URL.
URL = "http://arstechnica.com/science/2017/01/texas-slams-fda-with-lawsuit-for-holding-up-imported-execution-drugs/"
#req = urllib2.Request(URL)  # putting it into an object
print ("url is - " + URL)
#URL = URL.strip('\'"')
#print ("new url is - " + URL)
fURL = urllib2.urlopen(URL)

# Make the HTML file.
htmlName = "decrufted.html"
htmlThing = open(htmlName, 'w')
#htmlThing.write(Document(fURL.read()).summary())
# First I printed the summary to the shell, then decided to store it in a string variable.
#print Document(fURL.read()).summary()
strHtmlStuff = Document(fURL.read()).summary()
# Write the string object's contents out.
htmlThing.write(strHtmlStuff.encode('utf8') + '\n')
htmlThing.close()
print "The file name is: " + htmlName
# There you go with the HTML file.
#!/usr/bin/env python
import urllib.request, urllib.parse, urllib.error
import sys
from readability.readability import Document

url = sys.argv[1]
#url = "http://www.space.com/29740-mice-of-mars-rodents-pave-way-to-red-planet.html"
html = urllib.request.urlopen(url).read()
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
print(readable_title)
print(readable_article.encode('utf-8').strip())
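A small, assumed variation on the same Python 3 pattern that writes the extracted article to a local HTML file instead of printing it; the output filename 'article.html' is illustrative.

# Assumed variation: save the readability output to a file.
import sys
import urllib.request
from readability.readability import Document

url = sys.argv[1]
html = urllib.request.urlopen(url).read()
article = Document(html).summary()
title = Document(html).short_title()
with open('article.html', 'w', encoding='utf-8') as out:  # placeholder filename
    out.write('<h1>%s</h1>\n%s' % (title, article))
print('saved', title, 'to article.html')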