Example #1
File: plain-notes.py  Project: fredqi/notes
def download_html_as_text(url, filename=None, format_to='rst'):
    """Download HTML content from url and convert it to plain text."""
    # Construct internet connection
    headers = {'User-Agent' : 'Mozilla Firefox for Ubuntu canonical - 1.0'}
    req = urllib2.Request(url, headers=headers)
    con = urllib2.urlopen(req)
    html = con.read()

    # Fetch and convert main contents
    article = Document(html).summary()
    if len(article) < 1024:
        article = html

    article = patch_image_alt(article)
    title = Document(html).short_title()
    text = pypandoc.convert(article, format_to, format='html')

    # Build a reST title block (over/underline at least as long as the
    # UTF-8 encoded title) followed by the source URL.
    title_utf8 = title.encode('utf-8')
    lines_insert = [u'\n\n',
                    u'=' * len(title_utf8), u'\n',
                    title_utf8, u'\n',
                    u'=' * len(title_utf8), u'\n\n',
                    u':URL: ' + url, u'\n\n']
    # Keep only the part of the title before a '|', ',' or '-' separator.
    title = re.split(r'[|,-]', title)[0]

    # Search for urls of images
    imgurl_pattern = r'\.\.\s+\|([^|]+)\|\s+image::\s+(https?://\S+)'
    imgurl_re = re.compile(imgurl_pattern, re.I)
    image_urls = imgurl_re.findall(text)

    if filename is None:
        filename = title.split('-')[0].strip().replace(' ', '-')

    txtfile = open(filename + '-bak.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()

    # Replace online image URLs with local paths.
    images = download_images(image_urls, filename + '-images')
    for img, link in images:
        text = text.replace(link, img)

    txtfile = open(filename + '.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
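For reference, roughly the same fetch-and-convert flow in Python 3 might look like the sketch below. It assumes the readability-lxml and pypandoc packages plus a local pandoc install; the project-specific patch_image_alt and download_images helpers are omitted, and html_to_rst is only an illustrative name.

import urllib.request

import pypandoc
from readability import Document


def html_to_rst(url, user_agent='Mozilla/5.0'):
    """Fetch a page, keep the readable article, and convert it to reStructuredText."""
    req = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(req) as con:
        html = con.read()

    doc = Document(html)
    article = doc.summary()       # main content as an HTML fragment
    title = doc.short_title()     # page title with most site-name noise removed

    # pypandoc.convert_text needs the pandoc binary on the PATH.
    body = pypandoc.convert_text(article, 'rst', format='html')

    underline = '=' * len(title)
    return '\n'.join([underline, title, underline, '', ':URL: ' + url, '', body])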
Example #2
def _filter(self, html):
    # Normalize the raw bytes to unicode, then keep only the readable article.
    unicode_html = UnicodeDammit(html, is_html=True).unicode_markup
    text = Document(unicode_html).summary()
    soup = BeautifulSoup(text, 'lxml')
    text = clean_soup(soup).get_text()
    if self.min_len > 0:
        # Keep only paragraphs longer than min_len and rejoin them.
        paragraphs = [par for par in map(clean_spaces, text.split('\n'))
                      if len(par) > self.min_len]
        return self.delimiter.join(paragraphs)
    else:
        return clean_spaces(text)
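The same extraction pattern also works as a standalone helper. A minimal sketch, assuming beautifulsoup4, lxml and readability-lxml; the whitespace cleanup below stands in for the project's clean_soup/clean_spaces helpers, and extract_paragraphs is only an illustrative name:

from bs4 import BeautifulSoup, UnicodeDammit
from readability import Document


def extract_paragraphs(html, min_len=40, delimiter='\n'):
    """Return the readable paragraphs of an HTML page as plain text."""
    unicode_html = UnicodeDammit(html, is_html=True).unicode_markup
    article_html = Document(unicode_html).summary()
    text = BeautifulSoup(article_html, 'lxml').get_text()

    # Collapse whitespace and drop short fragments (menus, captions, and so on).
    paragraphs = (' '.join(par.split()) for par in text.split('\n'))
    return delimiter.join(par for par in paragraphs if len(par) > min_len)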
Example #3
def process_item(self, item, spider):
    if self.notThisPipeline(spider):
        return item
    # Pull the card image URL (if any) out of the raw HTML.
    hxs = HtmlXPathSelector(text=item["raw"])
    image = hxs.select("//*[contains(@id, 'cardpic0')]//a//img/@src").extract()
    if len(image) == 0:
        image = ""
    else:
        image = image[0]
        #image_local=image_path+image[0][-20:]
        #f=open(image_local,'w')
        #data=ul.urlopen(image).read()
        #f.write(data)
    item['image'] = image
    # Extract the readable article and title; html2text converts the article to plain text.
    article = Document(item['raw']).summary()
    item['article'] = html2text.html2text(article)
    title = Document(item['raw']).short_title()
    item['title'] = title.split('_')[0]
    return item
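HtmlXPathSelector is gone from current Scrapy releases; a rough modern equivalent of this pipeline step could use scrapy.Selector instead. This is only a sketch: the ArticlePipeline name is made up, the field names and the cardpic0 XPath are taken from the example above, and the pipeline-selection check is left out.

import html2text
from readability import Document
from scrapy.selector import Selector


class ArticlePipeline(object):
    def process_item(self, item, spider):
        sel = Selector(text=item['raw'])
        # First image inside the element whose id contains 'cardpic0', if any.
        item['image'] = sel.xpath(
            "//*[contains(@id, 'cardpic0')]//a//img/@src").get(default='')

        doc = Document(item['raw'])
        item['article'] = html2text.html2text(doc.summary())
        item['title'] = doc.short_title().split('_')[0]
        return item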
Example #4
def parser_content(url):
    """Split the readable article at url into image and text segments."""
    rt_result = []
    tag_re = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary().encode('utf8')
    #print readable_article
    readable_article = readable_article.replace('&#13;', '')
    cur_list = readable_article.split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            #print item.split('src=')[1].split('"')[1]
            # Image line: record every <img> src as a ['0', url] entry.
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    rt_result.append(['0', img.get('src')])
        else:
            # Text line: strip remaining tags and spaces, and keep it as a
            # ['1', text] entry if enough content survives.
            use_item = tag_re.sub('', item).replace(' ', '')
            if len(use_item) > 10:
                rt_result.append(['1', use_item])
    return rt_result
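A Python 3 variant of the same idea could walk the parsed article tree with lxml instead of splitting the HTML on newlines. A rough sketch, where ('image', src) / ('text', body) tuples replace the '0'/'1' markers above and parse_content is only an illustrative name:

import urllib.request

import lxml.html
from readability import Document


def parse_content(url):
    """Return the readable article as a list of ('image', src) / ('text', body) pairs."""
    html = urllib.request.urlopen(url).read()
    article = lxml.html.fromstring(Document(html).summary())

    result = []
    for element in article.iter():
        if element.tag == 'img' and element.get('src'):
            result.append(('image', element.get('src')))
        elif element.tag == 'p':
            text = ' '.join(element.text_content().split())
            if len(text) > 10:
                result.append(('text', text))
    return result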
Example #5
words_edited.extend(["BBC", "England", "Britain", "2012", "Monday", "Tuesday",
                     "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])
# Lower-case the stop-word set so the w.lower() membership test below also
# catches the capitalised entries added above.
s = set(word.lower() for word in words_edited)

# Call and parse the BBC News feed into a dict.
bbc = feedparser.parse("http://feeds.bbci.co.uk/news/uk/rss.xml")

entries_parsed = []

# Go through each entry, use the readability module to extract the article title
# (not always the same as the feed's title), and filter it against the NLTK-based
# common-word set to keep only uncommon words and names as keywords.
for entry in bbc.entries:
	entry_parsed = []
	html = urllib.urlopen(entry.link).read()
	article_title = Document(html).short_title()
	if ("404" not in article_title):
		entry_parsed.append(article_title)
		nltk_title = filter(lambda w: w.lower() not in s, article_title.split())
		processed_title = []
		for word in nltk_title:
			try: 
				encoded_word = word.encode()
				processed_word = encoded_word.translate(None, string.punctuation)
				if len(processed_word) > 1:
					processed_title.append(processed_word)
			except Exception,e:
				print str(e)
		entry_parsed.append(processed_title)
		dt = datetime.fromtimestamp(mktime(entry.published_parsed))
		entry_parsed.append(dt)
		entry_parsed.append(entry.link.encode())
		entries_parsed.append(entry_parsed)
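A Python 3 sketch of the same feed walk is below, assuming feedparser and readability-lxml; common_words stands in for the words_edited list built earlier in the script, and str.translate handles the punctuation stripping.

import string
import urllib.request
from datetime import datetime
from time import mktime

import feedparser
from readability import Document

common_words = {"bbc", "england", "britain", "2012", "monday", "tuesday",
                "wednesday", "thursday", "friday", "saturday", "sunday"}

entries_parsed = []
for entry in feedparser.parse("http://feeds.bbci.co.uk/news/uk/rss.xml").entries:
    html = urllib.request.urlopen(entry.link).read()
    title = Document(html).short_title()
    if "404" in title:
        continue
    # Strip punctuation and drop common words to keep only distinctive keywords.
    keywords = [w.translate(str.maketrans('', '', string.punctuation))
                for w in title.split() if w.lower() not in common_words]
    keywords = [w for w in keywords if len(w) > 1]
    published = datetime.fromtimestamp(mktime(entry.published_parsed))
    entries_parsed.append([title, keywords, published, entry.link])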
	
Example #6
class Summarize(object):
	"""Frequency-based extractive summarizer for web pages and raw text."""

	def __init__(self):
		self.freq = {}          # word -> count; later a list of (word, count) pairs sorted by count
		self.sentences = []     # cleaned sentences of the current document
		self.data = ''          # cleaned full text of the current document
		self.maxRec = 500       # recursion budget for evaluate()

	def checkSentence(self, s, x):
		# A sentence qualifies when it is not overly long and contains
		# every one of the x most frequent words.
		if len(s) > 50:
			return False
		for word in self.freq[:x]:
			if word[0] not in s:
				return False
		return True

	def summarize(self,url):
		self.data = urllib2.urlopen(url).read()
		self.data = Document(self.data).summary()
		
		# Strip tags, drop quotes, and turn separator characters into spaces
		# before splitting into sentences.
		self.data = (MLStripper.strip_tags(self.data)
			.replace('\n', ' ').replace(',', ' ').replace('\t', ' ')
			.replace("'", "").replace('"', ' ').replace('(', ' ')
			.replace(')', ' ').replace(':', ' ').replace(']', ' ')
			.replace('[', ' ').replace(';', ' '))
		self.data = self.data.lower()
		temp = self.data.split('.')
		

		text = re.findall(r'([a-z]+|\d+)+', self.data)

		for t in temp:
			self.sentences += [' '.join(re.findall(r'([a-z]+|\d+)+', t))]	

		self.freq = {}

		for word in text:
			if word in self.freq:
				self.freq[word] += 1
			else:
				self.freq[word] = 1

		self.freq = sorted(self.freq.iteritems(), key=operator.itemgetter(1))
		self.freq.reverse()
		t = lxml.html.parse(url)
		title = t.find(".//title").text
		return {'title': title, 'summary': self.evaluate(0.01)}

	def summarizeText(self, text):
		# Same cleanup as in summarize(), but on caller-supplied text.
		self.data = (MLStripper.strip_tags(text)
			.replace('\n', ' ').replace(',', ' ').replace('\t', ' ')
			.replace("'", "").replace('"', ' ').replace('(', ' ')
			.replace(')', ' ').replace(':', ' ').replace(']', ' ')
			.replace('[', ' ').replace(';', ' '))
		self.data = self.data.lower()
		temp = self.data.split('.')
		for t in temp:
			self.sentences += [' '.join(re.findall(r'([a-z]+|\d+)+', t))]	

		self.freq = {}

		# Count word frequencies over the cleaned text; iterating the raw
		# `text` argument here would count single characters, not words.
		for word in re.findall(r'([a-z]+|\d+)+', self.data):
			if word in self.freq:
				self.freq[word] += 1
			else:
				self.freq[word] = 1

		self.freq = sorted(self.freq.iteritems(), key=operator.itemgetter(1))
		self.freq.reverse()
		
		return self.evaluate(0.01)

	def evaluate(self, d):
		# d is the fraction of the vocabulary whose most frequent words a
		# sentence must contain; evaluate() tunes it recursively.
		self.maxRec -= 1
		output = ''

		num = len(self.freq)
		num = int(math.floor(num * d))

		for sentence in self.sentences:
			s = re.findall(r'[a-z]+', sentence)
			if self.checkSentence(s, num) and len(sentence) > 2:
				output += sentence[0].upper() + sentence[1:] + '. '

		# Re-run with a tighter or looser threshold until the summary is
		# roughly 20-40% of the original length, or the recursion budget runs out.
		if len(self.data) > 0:
			compression = 1 - float(len(output)) / len(self.data)
			if self.maxRec > 0:
				if compression >= 0.80:
					return self.evaluate(d - .001)
				if compression <= 0.60:
					return self.evaluate(d + .001)

		return output


#s = Summarize()

#print s.summarize('http://www.bbc.co.uk/news/uk-25996176')
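For comparison, here is a compact Python 3 sketch of the same frequency-based selection using collections.Counter. It is a simplified, non-recursive variant of the class above: no compression-ratio tuning, only crude tag stripping, and summarize_url is an illustrative name.

import re
from collections import Counter
from urllib.request import urlopen

from readability import Document


def summarize_url(url, top_fraction=0.01, max_sentence_words=50):
    """Keep sentences that contain all of the most frequent words of the article."""
    html = urlopen(url).read()
    text = re.sub(r'<[^>]+>', ' ', Document(html).summary()).lower()

    words = re.findall(r'[a-z0-9]+', text)
    top_n = max(1, int(len(set(words)) * top_fraction))
    top_words = [w for w, _ in Counter(words).most_common(top_n)]

    summary = []
    for sentence in text.split('.'):
        tokens = set(re.findall(r'[a-z0-9]+', sentence))
        if 0 < len(tokens) <= max_sentence_words and all(w in tokens for w in top_words):
            summary.append(sentence.strip().capitalize() + '.')
    return ' '.join(summary)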