def download_html_as_text(url, filename=None, format_to='rst'):
    """Download HTML content from url and convert it to plain text."""
    # Fetch the page with a browser-like User-Agent so the server is
    # less likely to refuse the request.
    headers = {'User-Agent': 'Mozilla Firefox for Ubuntu canonical - 1.0'}
    req = urllib2.Request(url, headers=headers)
    con = urllib2.urlopen(req)
    html = con.read()

    # Extract the main article; fall back to the whole page when the
    # readability summary looks suspiciously short.
    article = Document(html).summary()
    if len(article) < 1024:
        article = html
    article = patch_image_alt(article)
    title = Document(html).short_title()
    text = pypandoc.convert(article, format_to, format='html')

    # Build a reST-style header: the title over- and underlined,
    # followed by a :URL: field.
    title_utf8 = title.encode('utf-8')
    lines_insert = [
        u'\n\n', u'=' * len(title_utf8), u'\n', title_utf8, u'\n',
        u'=' * len(title_utf8), u'\n\n', u':URL: ' + url, u'\n\n'
    ]
    # Drop the site name that many pages append after '|', ',' or '-'.
    title = re.split(r'[|,-]', title)[0]

    # Search for URLs of images in the converted text, e.g.
    # ".. |alt| image:: http://example.com/pic.png" in reST output.
    imgurl_pattern = r'\.\.\s+\|([^|]+)\|\s+image::\s+(https?://\S+)'
    imgurl_re = re.compile(imgurl_pattern, re.I)
    image_urls = imgurl_re.findall(text)

    if filename is None:
        filename = title.split('-')[0].strip().replace(' ', '-')

    # Keep a backup copy that still points at the online images.
    txtfile = open(filename + '-bak.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()

    # Replace online image URLs with local paths.
    images = download_images(image_urls, filename + '-images')
    for img, link in images:
        text = text.replace(link, img)
    txtfile = open(filename + '.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
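# patch_image_alt and download_images are referenced above but not defined
# in this snippet. A minimal sketch of a compatible download_images,
# assuming it receives the (alt, url) tuples produced by findall above and
# returns (local_path, original_url) pairs for the replacement loop:

import os


def download_images(image_urls, dirname):
    """Fetch each image into dirname; return (local_path, url) pairs."""
    if not os.path.isdir(dirname):
        os.makedirs(dirname)
    saved = []
    for _alt, link in image_urls:
        # Name the local file after the last path segment of the URL.
        local_path = os.path.join(dirname, link.rstrip('/').split('/')[-1])
        data = urllib2.urlopen(link).read()
        with open(local_path, 'wb') as imgfile:
            imgfile.write(data)
        saved.append((local_path, link))
    return saved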
def _filter(self, html):
    unicode_html = UnicodeDammit(html, is_html=True).unicode_markup
    text = Document(unicode_html).summary()
    soup = BeautifulSoup(text, 'lxml')
    text = clean_soup(soup).get_text()
    if self.min_len > 0:
        paragraphs = text.split('\n')
        paragraphs = [par for par in map(clean_spaces, paragraphs)
                      if len(par) > self.min_len]
        return self.delimiter.join(paragraphs)
    else:
        return clean_spaces(text)
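# clean_soup and clean_spaces are helpers not shown in this snippet. One
# plausible implementation, assuming clean_soup only drops tags whose text
# is never useful and clean_spaces collapses whitespace runs:

import re


def clean_soup(soup):
    # Remove elements that contribute no readable text.
    for tag in soup(['script', 'style', 'noscript']):
        tag.extract()
    return soup


def clean_spaces(text):
    # Collapse any run of whitespace into a single space and trim ends.
    return re.sub(r'\s+', ' ', text).strip()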
def process_item(self, item, spider):
    if self.notThisPipeline(spider):
        return item
    hxs = HtmlXPathSelector(text=item["raw"])
    # Grab the first card picture, if the page has one.
    image = hxs.select(
        "//*[contains(@id, 'cardpic0')]//a//img/@src").extract()
    if len(image) == 0:
        image = ""
    else:
        image = image[0]
        # image_local = image_path + image[0][-20:]
        # f = open(image_local, 'w')
        # data = ul.urlopen(image).read()
        # f.write(data)
    item['image'] = image
    # Extract the readable article body and convert it to plain text.
    article = Document(item['raw']).summary()
    item['article'] = html2text.html2text(article)
    # Titles come as "headline_site"; keep only the headline.
    title = Document(item['raw']).short_title()
    title = title.split('_')
    item['title'] = title[0]
    return item
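# notThisPipeline is not defined in this snippet. A common Scrapy pattern
# (an assumption, not this project's actual code) is to let each spider opt
# in to shared pipelines by name; the method below would sit on the same
# pipeline class as process_item:

def notThisPipeline(self, spider):
    # True when this pipeline is not in the spider's opt-in list.
    return self.__class__.__name__ not in getattr(spider, 'pipelines', [])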
def parser_content(url):
    rt_result = []
    dr = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary().encode('utf8')
    # print readable_article
    readable_article = readable_article.replace(' ', '')
    cur_list = readable_article.split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            # print item.split('src=')[1].split('"')[1]
            # Parse the fragment and collect every image source.
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    rt_result.append(['0', img.get('src')])
        else:
            # Strip remaining tags and keep only non-trivial text chunks.
            use_item = dr.sub('', item).replace(' ', '')
            if len(use_item) > 10:
                rt_result.append(['1', use_item])
    return rt_result
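# A short usage sketch (the URL is a placeholder): each entry comes back as
# ['0', image_src] for an image or ['1', text] for a chunk of article text.

if __name__ == '__main__':
    for kind, payload in parser_content('http://example.com/article.html'):
        print kind, payload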
words_edited.extend(["BBC", "England", "Britain", "2012", "Monday",
                     "Tuesday", "Wednesday", "Thursday", "Friday",
                     "Saturday", "Sunday"])
s = set(words_edited)

# Call and parse the BBC News feed into a dict.
bbc = feedparser.parse("http://feeds.bbci.co.uk/news/uk/rss.xml")
entries_parsed = []

# Go through each entry, use the readability module to extract the article
# title (not the same as the feed title, unfortunately) and use NLTK to
# parse it for uncommon words and names. Create an array of these keywords.
for entry in bbc.entries:
    entry_parsed = []
    html = urllib.urlopen(entry.link).read()
    article_title = Document(html).short_title()
    if "404" not in article_title:
        entry_parsed.append(article_title)
        # Keep only words that are not in the common-word set.
        nltk_title = filter(lambda w: w.lower() not in s,
                            article_title.split())
        processed_title = []
        for word in nltk_title:
            try:
                encoded_word = word.encode()
                processed_word = encoded_word.translate(
                    None, string.punctuation)
                if len(processed_word) > 1:
                    processed_title.append(processed_word)
            except Exception, e:
                print str(e)
        entry_parsed.append(processed_title)
        dt = datetime.fromtimestamp(mktime(entry.published_parsed))
        entry_parsed.append(dt)
        entry_parsed.append(entry.link.encode())
        entries_parsed.append(entry_parsed)
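# The snippet above assumes words_edited already holds a lower-cased list
# of common words before the proper nouns are appended. A hypothetical way
# to build it from NLTK corpora (stopwords plus the most frequent Brown
# corpus words), so that only unusual words survive the filter:

from collections import Counter
from nltk.corpus import brown, stopwords

counts = Counter(w.lower() for w in brown.words())
words_edited = [w.lower() for w in stopwords.words('english')]
words_edited.extend(w for w, _ in counts.most_common(3000))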
class Summarize(object):
    """Frequency-based extractive summarizer for web articles."""

    def __init__(self):
        self.freq = {}
        self.sentences = []
        self.data = ''
        self.maxRec = 500

    def checkSentence(self, s, x):
        # A sentence qualifies when it is short enough and contains
        # every one of the x most frequent words.
        if len(s) > 50:
            return False
        for word in self.freq[:x]:
            if word[0] not in s:
                return False
        return True

    def summarize(self, url):
        self.data = urllib2.urlopen(url).read()
        self.data = Document(self.data).summary()
        # Strip markup, drop apostrophes, and replace punctuation that
        # would confuse tokenisation with spaces.
        self.data = MLStripper.strip_tags(self.data).replace("'", "")
        for ch in '\n,\t"():][;':
            self.data = self.data.replace(ch, ' ')
        self.data = self.data.lower()
        temp = self.data.split('.')
        text = re.findall(r'([a-z]+|\d+)+', self.data)
        for t in temp:
            self.sentences += [' '.join(re.findall(r'([a-z]+|\d+)+', t))]
        # Build a word-frequency table, most frequent first.
        self.freq = {}
        for word in text:
            if word in self.freq:
                self.freq[word] += 1
            else:
                self.freq[word] = 1
        self.freq = sorted(self.freq.iteritems(), key=operator.itemgetter(1))
        self.freq.reverse()
        t = lxml.html.parse(url)
        title = t.find(".//title").text
        return {'title': title, 'summary': self.evaluate(0.01)}

    def summarizeText(self, text):
        self.data = MLStripper.strip_tags(text).replace("'", "")
        for ch in '\n,\t"():][;':
            self.data = self.data.replace(ch, ' ')
        self.data = self.data.lower()
        temp = self.data.split('.')
        for t in temp:
            self.sentences += [' '.join(re.findall(r'([a-z]+|\d+)+', t))]
        # Count frequencies over the tokenised text, not the raw string
        # argument, so we tally words rather than single characters.
        words = re.findall(r'([a-z]+|\d+)+', self.data)
        self.freq = {}
        for word in words:
            if word in self.freq:
                self.freq[word] += 1
            else:
                self.freq[word] = 1
        self.freq = sorted(self.freq.iteritems(), key=operator.itemgetter(1))
        self.freq.reverse()
        return self.evaluate(0.01)

    def evaluate(self, d):
        # Recursively tune the fraction d of top words a sentence must
        # contain until the summary is 60-80% shorter than the source.
        self.maxRec -= 1
        output = ''
        num = int(math.floor(len(self.freq) * d))
        for sentence in self.sentences:
            s = re.findall(r'[a-z]+', sentence)
            if self.checkSentence(s, num) and len(sentence) > 2:
                output += sentence[0].upper() + sentence[1:] + '. '
        if len(self.data) > 0:
            # Use float division; integer division would always give 0.
            compression = 1 - float(len(output)) / len(self.data)
            if self.maxRec > 0:
                if compression >= 0.80:
                    return self.evaluate(d - .001)
                if compression <= 0.60:
                    return self.evaluate(d + .001)
        return output

# s = Summarize()
# print s.summarize('http://www.bbc.co.uk/news/uk-25996176')
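# MLStripper.strip_tags is called above but not defined in this snippet.
# The classic HTMLParser recipe, wrapped in a static method to match the
# call sites, would look like this (an assumed reconstruction):

from HTMLParser import HTMLParser


class MLStripper(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        # Accumulate only the text nodes; tags are silently dropped.
        self.fed.append(d)

    def get_data(self):
        return ' '.join(self.fed)

    @staticmethod
    def strip_tags(html):
        stripper = MLStripper()
        stripper.feed(html)
        return stripper.get_data()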