def checkerFunction(myInput):
    # Assumes module-level imports: datetime, os, hashlib, urllib, requests,
    # and readability's Document.
    today = datetime.date.today()
    try:
        # Use Google's "I'm Feeling Lucky" redirect to locate the privacy policy page.
        google1 = 'http://www.google.com/search?hl=en&q='
        google2 = '%20privacy%20policy&btnI=1'
        keyword = myInput
        url = google1 + keyword + google2
        r = requests.get(url, allow_redirects=False)
        url = r.headers['location']
    except Exception as e:
        return
    myFullPath = "./sandbox/db/" + keyword
    if not os.path.exists("./sandbox"):
        os.makedirs("./sandbox")
    if not os.path.exists("./sandbox/db/"):
        os.makedirs("./sandbox/db/")
    if not os.path.exists(myFullPath):
        os.makedirs(myFullPath)
    filename = keyword + "." + str(today)
    filetowrite = myFullPath + "/" + filename
    fileExist = os.path.isfile(filetowrite)
    if url is None:
        return
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary()
    tempFileMade = False
    originalFileMade = False
    if fileExist:
        # A copy for today already exists, so write a temporary file instead.
        filetowrite = filetowrite + ".tmp."
        f = open(filetowrite, 'w')
        writeThis = str(readable_article.encode('ascii', 'ignore'))
        f.write(writeThis)
        f.close()
        tempFileMade = True
    else:
        f = open(filetowrite, 'w')
        writeThis = str(readable_article.encode('ascii', 'ignore'))
        f.write(writeThis)
        f.close()
        originalFileMade = True
    hashedmd5 = hashlib.md5(readable_article.encode('ascii', 'ignore'))
    hashedArticle = hashedmd5.hexdigest()
    return hashedArticle
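A minimal usage sketch (not part of the original source): checkerFunction returns the MD5 hex digest of the readable policy text, or None on failure, so one way to use it is to compare today's hash with a hash saved by an earlier run. The keyword "example" and the previous_hash value below are placeholders.

# Hypothetical driver: hash today's policy text and compare it with a stored hash.
previous_hash = "d41d8cd98f00b204e9800998ecf8427e"  # placeholder from an earlier run
current_hash = checkerFunction("example")
if current_hash is not None and current_hash != previous_hash:
    print "privacy policy text has changed since the last run"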
def get_announcement_body(url):
    # Assumes a mechanize-style browser `br` plus datetime, urlparse,
    # BeautifulSoup and readability's Document at module level.
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()
    readable_announcement = Document(html).summary()
    readable_title = Document(html).title()
    soup = BeautifulSoup(readable_announcement, "lxml")
    final_announcement = soup.text
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        images.append(li)
    resp[0] = str(final_announcement.encode("ascii", "ignore"))
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = str(now.month) + " " + str(now.day) + " " + str(now.year) + "-" + \
        str(now.hour) + ":" + str(now.minute) + ":" + str(now.second)
    resp[3] = url
    resp[4] = url
    resp[5] = ""
    #insertDB(resp)
    #print "inserted resp"
    title_article = []
    title_article.append(final_announcement)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
def extrat_html_document(url):
    # Python 2 snippet; relies on module-level urllib2, BeautifulSoup,
    # readability's Document, the lists block_url / exception_url, and a log file _file.
    try:
        print "extrat_html_document"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = urllib2.Request(url, headers=headers)
        socket = urllib2.urlopen(r, timeout=1)
        url = socket.geturl()
        html = socket.read()
        # Blocked URLs are skipped outright.
        for bl_url in block_url:
            if len(url.split(bl_url)) > 1:
                summary = "block"
                return summary
        # For exception URLs only the title is kept.
        for ext_url in exception_url:
            if len(url.split(ext_url)) > 1:
                readable_title = Document(html).short_title()
                summary = readable_title.encode('utf-8')
                _file.write(summary + '\n')
                return summary
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        summary = readable_title.encode('utf-8') + readable_title.encode('utf-8')
        print "soup start"
        soup = BeautifulSoup(readable_article.replace("br/", "p"), "html.parser")
        print "summary:"
        for s in soup("p"):
            summary += str(s.encode('utf-8'))
        # summary += readable_article.encode('utf-8')
    except Exception:
        _file.write('extrat_html_document Failed URL : ' + url + '\n')
        summary = "Failed Get data"
    return summary
def get_content(self, url):
    # Splits the readable article into a list of [type, value] pairs:
    # '0' entries are image URLs, '1' entries are text paragraphs.
    rt_result = []
    dr = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    cur_title = Document(html).short_title().replace(' ', '')
    readable_article = Document(html).summary()
    print readable_article.encode('utf8')
    readable_article = readable_article.replace(' ', '')
    cur_list = readable_article.replace('</p>', '\n').split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            #print item.split('src=')[1].split('"')[1]
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    rt_result.append(['0', img.get('src')])
        else:
            use_item = dr.sub('', item).replace(' ', '')
            if len(use_item) > 10:
                rt_result.append(['1', use_item])
    return cur_title, rt_result
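A hedged driver sketch for the method above: `spider` stands in for whatever class actually defines get_content(), and the URL is a placeholder; it just separates the mixed result list back into text and images.

# Hypothetical usage of get_content(); the instance name and URL are illustrative.
title, parts = spider.get_content('http://example.com/article.html')
texts = [value for kind, value in parts if kind == '1']
image_urls = [value for kind, value in parts if kind == '0']
print title
print '%d paragraphs, %d images' % (len(texts), len(image_urls))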
def download_html_as_text(url, filename=None, format_to='rst'):
    """Download HTML content from url and convert it to plain text."""
    # Construct internet connection
    headers = {'User-Agent': 'Mozilla Firefox for Ubuntu canonical - 1.0'}
    req = urllib2.Request(url, headers=headers)
    con = urllib2.urlopen(req)
    html = con.read()
    # Fetch and convert main contents
    article = Document(html).summary()
    if len(article) < 1024:
        article = html
    article = patch_image_alt(article)
    title = Document(html).short_title()
    text = pypandoc.convert(article, format_to, format='html')
    title_utf8 = title.encode('utf-8')
    lines_insert = [u'\n\n', u'=' * len(title_utf8), u'\n', title_utf8, u'\n',
                    u'=' * len(title_utf8), u'\n\n', u':URL: ' + url, u'\n\n']
    title = title.split('|,-')[0]
    # Search for urls of images
    imgurl_pattern = '\.\.\s+\|([^|]+)\|\s+image::\s+(https?://\S+)'
    imgurl_re = re.compile(imgurl_pattern, re.I)
    image_urls = imgurl_re.findall(text)
    if filename is None:
        filename = title.split('-')[0].strip().replace(' ', '-')
    txtfile = open(filename + '-bak.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
    # Replace online image URLs with local paths.
    images = download_images(image_urls, filename + '-images')
    for img, link in images:
        text = text.replace(link, img)
    txtfile = open(filename + '.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
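A one-line call sketch, assuming the module-level helpers patch_image_alt and download_images referenced above exist; the URL and filename are placeholders.

# Illustrative call: converts the post to reStructuredText, writing
# 'my-post-bak.rst' and 'my-post.rst' and downloading images into 'my-post-images'.
download_html_as_text('http://example.com/some-post.html', filename='my-post')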
def extract(self, doc):
    from readability.readability import Document
    from bs4 import BeautifulSoup
    try:
        if 'html' in doc:
            html = doc['html']
            readable = Document(html, recallPriority=self.recall_priority).summary(
                html_partial=self.html_partial)
            cleantext = BeautifulSoup(readable.encode('utf-8'), 'lxml').strings
            readability_text = ' '.join(cleantext)
            return readability_text
        else:
            return ''
    except Exception as e:
        print 'Error in extracting readability %s' % e
        return ''
def extract(self, html_content, options=None):
    if options:
        self.__parse_options(options)
    from readability.readability import Document
    from bs4 import BeautifulSoup
    try:
        if html_content:
            readable = Document(html_content, recallPriority=self.recall_priority).summary(
                html_partial=self.html_partial)
            cleantext = BeautifulSoup(readable.encode('utf-8'), 'lxml').strings
            readability_text = ' '.join(cleantext)
            return {'text': readability_text}
        else:
            return None
    except Exception as e:
        print 'Error in extracting readability %s' % e
        return None
def getReadableArticle(url):
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()
    readable_article = Document(html).summary()
    #print readable_article
    readable_title = Document(html).title()
    #print readable_title
    soup = BeautifulSoup(readable_article)
    final_article = soup.text
    #print final_article
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        #print li
        images.append(li)
    resp[0] = str(final_article.encode("ascii", "ignore"))
    #print resp[0]
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = str(now.month) + " " + str(now.day) + " " + str(now.year) + "-" + \
        str(now.hour) + ":" + str(now.minute) + ":" + str(now.second)
    resp[3] = url
    resp[4] = url
    #if len(images) > 0:
    #    resp[5] = images[0]
    #else:
    resp[5] = ""
    insertDB(resp)
    print "inserted resp"
    title_article = []
    title_article.append(final_article)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
def startanalyse(region, company, keyword, count):
    print '\nModule 3 - analyse html pages to judge keyword related or not.'
    searchkey = '%s+%s' % (company, keyword)
    # file for saving analyzing results
    txtfilename = region + os.sep + company + os.sep + '%s_result.txt' % searchkey
    txtfile = open(txtfilename, 'r')
    txtcont = txtfile.readlines()
    txtfile.close()
    # meta html page file name
    _htmlfilename = region + os.sep + company + os.sep + searchkey + '_%d.html'
    yes = 0
    no = 0
    # patterns: title, keywords, description (the closing ">" is not always preceded the same way)
    pattern_title = '<title>(.*?)</title>'
    pattern_key = '<meta\s(name=["]?keywords["]?\scontent=\"(.*?)\"|content=\"(.*?)\"\sname=["]?keywords["]?).*?>'
    pattern_des = '<meta\s(name=["]?description["]?\scontent=\"(.*?)\"|content=\"(.*?)\"\sname=["]?description["]?).*?>'
    txtlist = []
    tmpfilename = 'tmp.txt'  # temp usage
    for i in range(count):
        tmp = i + 1
        htmlfilename = _htmlfilename % tmp
        company_flag = False
        keyword_flag = False
        # skip html files that are empty
        file_size = os.stat(htmlfilename).st_size
        if file_size != 0:
            htmlfile = open(htmlfilename, 'r')
            htmlcontent = htmlfile.read()
            htmlfile.close()
            # 1 - head content: description, keywords, title
            head_title = re.search(pattern_title, htmlcontent, re.I | re.S)
            head_key = re.search(pattern_key, htmlcontent, re.I | re.S)
            head_des = re.search(pattern_des, htmlcontent, re.I | re.S)
            # 2 - body content: readability
            body_content = Document(htmlcontent).summary()
            tmpfile = open(tmpfilename, 'w')
            tmpfile.write(body_content.encode('utf-8'))
            tmpfile.close()
            tmpfile = open(tmpfilename, 'r')
            body_content = tmpfile.read()
            tmpfile.close()
            # is the page company related?
            if (head_title is not None and company in head_title.group(1)) or \
               (head_key is not None and company in head_key.group(1)) or \
               (head_des is not None and company in head_des.group(1)):
                company_flag = True
            else:
                _company = unicode(company, 'mbcs')
                if _company in body_content:
                    company_flag = True
            # if not company related, stop judging
            if company_flag:
                # is the page keyword related?
                if (head_title is not None and keyword in head_title.group(1)) or \
                   (head_key is not None and keyword in head_key.group(1)) or \
                   (head_des is not None and keyword in head_des.group(1)):
                    keyword_flag = True
                else:
                    _keyword = unicode(keyword, 'mbcs')
                    if _keyword in body_content:
                        keyword_flag = True
        # show results
        print tmp, ' company related:', company_flag, ' keyword related:', keyword_flag
        # store results
        if company_flag and keyword_flag:
            txtlist.append('yes')
        else:
            txtlist.append('no')
    # write back to the analyzing result file
    for j in range(len(txtcont)):
        newcont = '*' + txtlist[j] + '\n'
        oldcont = txtcont[j]
        txtcont[j] = oldcont.replace('\n', newcont)
    txtfile = open(txtfilename, 'w')
    txtfile.writelines(txtcont)
    txtfile.close()
    if os.path.exists(tmpfilename):
        os.remove(tmpfilename)
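A hedged call sketch for the analysis step above: it assumes the earlier crawl modules have already written `<region>/<company>/<company>+<keyword>_result.txt` and the numbered `*_N.html` pages; the argument values are placeholders.

# Illustrative invocation with placeholder arguments; the region/company
# directory layout must already exist from the earlier modules.
startanalyse('guangdong', 'SomeCompany', 'pollution', 10)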
import os, re
import requests
import pdfkit
import json
from bs4 import BeautifulSoup, NavigableString, Tag
from apiclient.discovery import build
from readability.readability import Document
import urllib

blogURL = 'http://katelynnow.com/riding-solo/'
r = requests.get(blogURL)
html = r.text
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
with open('test.html', 'wb') as f:
    f.write(readable_article.encode('utf8'))
pdfkit.from_string(readable_title + readable_article, 'out.pdf')

os.chdir('/Users/mrswhitneybell/Documents/Jason/J4')


def blog2pdf(blogURL):
    service = build('blogger', 'v3', developerKey='AIzaSyAMtRVlEQPjdxvESWqjocPE42D9s1eFlRM')
    blogs = service.blogs()
    request = blogs.getByUrl(url=blogURL, view='READER')
    blogInfo = request.execute()
    blogId = blogInfo['id']
    posts = service.posts()
    request = posts.list(blogId=blogId, status='live', orderBy='published',
                         fetchImages=True, view='READER')
def parse(self, response):
    sel = Selector(response)
    item = PostItem()
    # fill page url
    item['url'] = response.url

    # extract page title
    def match_title(title):
        if title is None:
            return False
        for keyword in self.title_keywords:
            regex = re.compile(".*%s.*" % keyword)
            if not regex.match(title):
                return False
        return True

    for tag in ("h1", "h2", "h3", "h4", "title", "strong", "b", "p", "span"):
        for heads in sel.xpath("//%s/text()" % tag).extract():
            #for head in heads.strip().encode('utf-8').split(" - "):
            for head in filter(None, self.head_seps.split(heads.encode('utf-8'))):
                if match_title(head):
                    item['title'] = head.strip()
                    break

    # clean page content with readability
    html = sel.xpath("//html").extract()
    if html:
        content = Document(html[0]).summary()
        item['page_content'] = content.encode('utf-8')
        #print item['page_content']
    if item.get('title') is None:
        print "title not found in this page"
        return
    if item.get('page_content') is None:
        print "content not found in this page"
        return
    #text = HtmlTool.text(html[0]).encode('utf-8')
    text = HtmlTool.text(content).encode('utf-8')
    lines = filter(None, self.line_seps.split(text))

    # try to extract project name from title
    res = self.project_name_exp.match(item['title'])
    if res:
        item["project_name"] = res.groups()[1]
    # project pollutions
    item["pollutions"] = {}
    # extract other fields from page content
    post_lapse_time = None
    self.hinting_results = {}
    # dates occurring in page content
    self.dates = []

    for line in lines:
        def extract_field(field):
            exps = self.field_regexps[field].get("extract", [])
            for exp in exps:
                result = exp[0].match(line)
                if result:
                    try:
                        return result.groups()[exp[1]]
                    except:
                        pass

        def hintextract_field(field):
            if field in self.hintings:
                exps = self.field_regexps[field].get("hintextract", [])
                for exp in exps:
                    result = exp[0].match(line)
                    if result:
                        try:
                            return result.groups()[exp[1]]
                        except:
                            pass

        def set_field(field):
            def set_extract_field(field):
                extract_res = extract_field(field)
                if extract_res:
                    item[field] = extract_res
                    self.hinting_results[field] = False
                    return True

            def set_hintextract_field(field):
                hintextract_res = hintextract_field(field)
                if hintextract_res:
                    item[field] = hintextract_res
                    self.hinting_results[field] = True
                    return True

            if not item.get(field):
                if set_extract_field(field):
                    return True
                else:
                    return set_hintextract_field(field)
            elif self.hinting_results.get(field):
                set_extract_field(field)
                return True
            else:
                return True

        def append_field(field):
            exps = self.field_regexps[field].get("appending", [])
            for exp in exps:
                if exp[0].match(line):
                    item[field][exp[1].encode('utf-8')] = 1

        def hinting_fields(fields):
            for field in fields:
                exps = self.field_regexps[field].get("hinting", [])
                for exp in exps:
                    if exp.match(line):
                        self.hintings = {}
                        for field in fields:
                            self.hintings[field] = True
                        break

        # set hinting field
        hinting_fields(["project_address"])
        hinting_fields(["builder_name", "builder_address"])
        hinting_fields(["eia_name", "eia_address"])
        # set item fields
        if set_field("project_name"):
            set_field("project_address")
            set_field("project_investment")
        if set_field("builder_name"):
            set_field("builder_address")
        #print self.hintings
        if set_field("eia_name"):
            set_field("eia_address")
        if set_field("start_date"):
            res = self.date_regexp.match(line)
            if res:
                digits = res.groups()
                try:
                    self.dates.append(WorkDay(Digit(digits[0]), Digit(digits[2]), Digit(digits[4])))
                except:
                    pass
        # append fields
        append_field("pollutions")
        # extract fields
        if post_lapse_time is None:
            lapse = extract_field("lapse_time")
            if lapse:
                post_lapse_time = Digit(lapse)

    # sort all dates occurring in content
    # and the first date must be start date of this project
    # if there is no explicit lapse time we can use the last date as end date of this project
    self.dates = sorted(self.dates)
    if self.dates:
        item["post_start_date"] = str(self.dates[0] + 0)
    if self.dates and post_lapse_time:
        item["post_end_date"] = str(self.dates[0].within(post_lapse_time))
    elif len(self.dates) > 1:
        item["post_end_date"] = str(self.dates[-1] + 0)
    if item.get("project_investment"):
        investment = Digit(item["project_investment"])
        item["project_investment"] = investment if investment > 0 else None
    # print a summary of the extracted fields (the Chinese labels are: page title,
    # page link, project name/address/investment, pollution types, builder name and
    # address, EIA agency name and address, announcement dates)
    print "网页标题:", item.get('title')
    print "网页链接:", item.get('url')
    print "项目名称:", item.get("project_name")
    print "项目地址:", item.get("project_address")
    print "项目投资:", item.get("project_investment")
    print "污染类型:", "、".join(item.get("pollutions").keys())
    print "建设单位:", item.get("builder_name")
    print "单位地址:", item.get("builder_address")
    print "环评单位:", item.get("eia_name")
    print "单位地址:", item.get("eia_address")
    print "公告时间:", item.get("post_start_date"), "-", item.get("post_end_date")
    item["page_content"] = None
    yield item
# Here is my code.
# Imports.
from readability.readability import Document
import urllib2

# Get the user's URL.
URL = "http://arstechnica.com/science/2017/01/texas-slams-fda-with-lawsuit-for-holding-up-imported-execution-drugs/"
#req = urllib2.Request(URL)  # putting it into an object
print ("url is - " + URL)
#URL = URL.strip('\'"')
#print ("new url is - " + URL)
fURL = urllib2.urlopen(URL)

# Make the HTML file.
htmlName = "decrufted.html"
htmlThing = open(htmlName, 'w')
#htmlThing.write(Document(fURL.read()).summary())
# First I printed the summary to the shell, then decided to store it in a string variable.
#print Document(fURL.read()).summary()
strHtmlStuff = Document(fURL.read()).summary()
# Write the string object's contents out.
htmlThing.write(strHtmlStuff.encode('utf8') + '\n')
htmlThing.close()
print "The file name is: " + htmlName
# There you go with the HTML file.
#!/usr/bin/env python
import urllib.request, urllib.parse, urllib.error
import sys
from readability.readability import Document

url = sys.argv[1]
#url = "http://www.space.com/29740-mice-of-mars-rodents-pave-way-to-red-planet.html"
html = urllib.request.urlopen(url).read()
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
print(readable_title)
print(readable_article.encode('utf-8').strip())
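A small, assumed variation on the same Python 3 pattern that writes the extracted article to a local HTML file instead of printing it; the output filename 'article.html' is illustrative.

# Assumed variation: save the readability output to a file.
import sys
import urllib.request
from readability.readability import Document

url = sys.argv[1]
html = urllib.request.urlopen(url).read()
article = Document(html).summary()
title = Document(html).short_title()
with open('article.html', 'w', encoding='utf-8') as out:  # placeholder filename
    out.write('<h1>%s</h1>\n%s' % (title, article))
print('saved', title, 'to article.html')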