Exemplo n.º 1
def get_plain_from_html(html):
    textout = StringIO()
    formtext = AbstractFormatter(DumbWriter(textout))
    parser = HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    return textout.getvalue()
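The snippet above assumes the Python 2 standard library (htmllib and formatter were removed in Python 3). A minimal sketch of the imports and a call it relies on; the HTML string is only an illustration:

from StringIO import StringIO
from formatter import AbstractFormatter, DumbWriter
from htmllib import HTMLParser

# Example call with made-up markup.
print get_plain_from_html('<p>Hello <a href="http://example.com">world</a></p>')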
Exemplo n.º 2
class Retrive(object):
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.php'):
        parsedurl = urlparse(url,'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)
        if sep != '/':
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = 'error'
        return retval

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 3
class Retriever(object):
	def __init__(self,url):
		self.url=url
		self.file=self.filename(url)
	def filename(self,url,deffile='index.html'):
		parsedurl=urlparse(url,'http:',0)
		path=parsedurl[1]+parsedurl[2]	#weibo.com+/gothack
		ext=splitext(path) #weibo.com/gothack , ''  #split by .
		if ext[1]=='':
			if path[-1]=='/':
				path+=deffile
			else:
				path+='/'+deffile
		ldir=dirname(path)	#weibo.com #before the last /
		if sep != '/':	#default value is /
			ldir=replace(ldir,'/',sep)	#replace  / with sep #(string,old,new)
		if not isdir(ldir):
			if exists(ldir):unlink(ldir)
			makedirs(ldir)
		return path
	def download(self):
		try:
			retval=urlretrieve(self.url,self.file)
		except IOError:
			retval=('***ERROR: invalid URL "%s"' %self.url,)
		return retval
	def parseAndGetLinks(self):
		self.parser=HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		self.parser.feed(open(self.file).read())
		self.parser.close()
		return self.parser.anchorlist
Exemplo n.º 4
class Restriever(object):
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile = 'index.htm'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile

        ldir = dirname(path)
        if sep != '/':
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):
            if exists(ldir) : unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 5
class Retriever(object):  # class for downloading Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile="index.htm"):
        parsedurl = urlparse(url, "http:", 0)  # parse the path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == "":  # no file, use the default
            if path[-1] == "/":
                path += deffile
            else:
                path += "/" + deffile
        ldir = dirname(path)  # local directory
        if sep != "/":
            ldir = replace(ldir, "/", sep)
        if not isdir(ldir):  # create the directory if it does not exist
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):  # download the Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('***Error: invalid URL: "%s"' % self.url, )
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
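A Retriever like the one above is normally driven by a small loop that downloads a page and then walks its anchors, as the Crawler class further down in this collection does. A minimal driver sketch; the start URL is only an illustration:

r = Retriever('http://www.example.com/')
result = r.download()                      # (filename, headers) on success
if not str(result[0]).startswith('***'):   # error results start with '***'
    for link in r.parseAndGetLinks():
        print link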
Exemplo n.º 6
 def OpenURL(self,url):
     from htmllib import HTMLParser
     import formatter
     self.url = url
     m = re.match('http://([^/]+)(/\S*)\s*', url)
     if m:
         host = m.groups()[0]
         path = m.groups()[1]
     else:
         m = re.match('http://(\S+)\s*', url)
         if not m:
             # Invalid URL
             self.logprint("Invalid or unsupported URL: %s" % (url))
             return
         host = m.groups()[0]
         path = ''
     f = self.RetrieveAsFile(host,path)
     if not f:
         self.logprint("Could not open %s" % (url))
         return
     self.logprint("Receiving data...")
     data = f.read()
     tmp = open('hangman_dict.txt','w')
     fmt = formatter.AbstractFormatter(formatter.DumbWriter(tmp))
     p = HTMLParser(fmt)
     self.logprint("Parsing data...")
     p.feed(data)
     p.close()
     tmp.close()
Exemplo n.º 7
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javascript tags and then
        create a plain-text-message by parsing the html
        and attaching links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
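A short usage sketch for the function above; the HTML string is made up, and in Python 2 a plain str passes through the message.decode('utf-8', 'ignore') call unchanged:

html = '<p>See <a href="https://example.com/doc">the docs</a></p>'
print create_plaintext_message(html)
# the anchor shows up as an endnote, rewritten to 'http://example.com/doc'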
Exemplo n.º 8
 def create_plaintext_message(self, text):
     """ Create a plain-text-message by parsing the html
         and attaching links as endnotes
     """
     plain_text_maxcols = 72
     textout = cStringIO.StringIO()
     formtext = formatter.AbstractFormatter(formatter.DumbWriter(
                                            textout, plain_text_maxcols))
     parser = HTMLParser(formtext)
     parser.feed(text)
     parser.close()
     # append the anchorlist at the bottom of a message
     # to keep the message readable.
     counter = 0
     anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
     for item in parser.anchorlist:
         counter += 1
         if item.startswith('https://'):
             new_item = item.replace('https://', 'http://')
         else:
             new_item = item
         anchorlist += "[%d] %s\n" % (counter, new_item)
     text = textout.getvalue() + anchorlist
     del textout, formtext, parser, anchorlist
     return text
Exemplo n.º 9
class Retriever(object):
    def __init__(self,url):
        self.url=url
        self.file=self.filename(url)
    def filename(self,url,deffile='index.html'):
        """
        Generate the download link and the local filename
        """
        full_url = ""
        if url.endswith(DOM_SUFFIX):
            full_url = url + '/'
        else:
            full_url = url
        parsedurl=urlparse(full_url,'http:',0)
        path=parsedurl[1]+parsedurl[2]
        ext=splitext(path)
        if ext[1]=='':
            if path[-1]=='/':
                path+=deffile
            else:
                path+='/'+deffile
        ldir=dirname(path)
        if sep!='/':
            ldir=replace(ldir,'/',sep)
            path=replace(path,'/',sep)

        if not isdir(ldir):
            if exists(ldir):
                    #unlink(ldir)
                pass
            else:    
                makedirs(ldir)
        print path
        return path
            
    def download(self):
        """
        Download the file
        """
        try:
            retval=urlretrieve(self.url,self.file)
        except IOError:
            retval=('***ERROR :invalid URL "%s"' %self.url,)
        return retval   
    
    def parseAndGetLinks(self):
        """
        Get the links contained in the page
        """
        #print 'Get Html Links from file:%s' % self.file
        #self.parser=HTMLParser(AbstractFormatter(DumbWriter(StringIO)))
        self.parser=HTMLParser(NullFormatter())
        #self.parser.feed(open(self.file).read())
        try:
            self.parser.feed(open(self.file).read())
            self.parser.close()
            return self.parser.anchorlist
        except:
            print self.file + " error !"
            return [] 
Exemplo n.º 10
class Retriever(htmllib.HTMLParser): #download Web Pages
	
	def __init__(self, url):
		self.url = url
		self.file = self.filename(url)

	def filename(self, url, deffile='index.htm'):
		parsedurl = urlparse(url,'http:',0) #parse path
		path = parsedurl[1] + parsedurl[2]
		text = splitext(path)
		if text[1]=='': # not a file, use default
			if path[-1] == '/':
				path = path + deffile
			else:
				path = path + '/' + deffile
		print "PATH:%s" % path
		dir = dirname(path)
		if not isdir(dir): #create new archive dir if necessary
			if exists(dir): unlink(dir)
			makedirs(dir)
		return path
	
	def download(self): #download web pages
		try:
			retval = urlretrieve(self.url,self.file)
		except IOError:
			retval =('***ERROR: invalid URL "%s"' % self.url)
		
		return retval
	def parseAndGetLinks(self): #Parse HTML
		self.parser = HTMLParser(AbstractFormatter(\
				DumbWriter(StringIO())))
		self.parser.feed(open(self.file).read())
		self.parser.close()
		return self.parser.anchorlist
Exemplo n.º 11
 def OpenURL(self, url):
     from htmllib import HTMLParser
     import formatter
     self.url = url
     m = re.match('http://([^/]+)(/\S*)\s*', url)
     if m:
         host = m.groups()[0]
         path = m.groups()[1]
     else:
         m = re.match('http://(\S+)\s*', url)
         if not m:
             # Invalid URL
             self.logprint("Invalid or unsupported URL: %s" % (url))
             return
         host = m.groups()[0]
         path = ''
     f = self.RetrieveAsFile(host, path)
     if not f:
         self.logprint("Could not open %s" % (url))
         return
     self.logprint("Receiving data...")
     data = f.read()
     tmp = open('hangman_dict.txt', 'w')
     fmt = formatter.AbstractFormatter(formatter.DumbWriter(tmp))
     p = HTMLParser(fmt)
     self.logprint("Parsing data...")
     p.feed(data)
     p.close()
     tmp.close()
Exemplo n.º 12
class Retriever(object):# class for downloading Web pages

    def __init__(self,url):
        self.url = url
        self.file = self.filename(url)

    def filename(self,url,deffile ="index.htm"):
        parsedurl = urlparse(url,"http:",0) # parse the path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == "": # no file, use the default
            if path[-1] == "/":
                path += deffile
            else:
                path += "/" + deffile
        ldir = dirname(path) # local directory
        if sep != "/":
            ldir = replace(ldir,"/",sep)
        if not isdir(ldir): # create the directory if it does not exist
            if exists(ldir):unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):# download the Web page
        try:
            retval = urlretrieve(self.url,self.file)
        except IOError:
            retval = ('***Error: invalid URL: "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self): # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 13
class Retriever(object):  # download Web pages

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)  ## parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':  # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)  # local directory
        if sep != '/':  # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):  # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 14
def create_plaintext_message(message):
        """ Create clean plain text version of email message

            Parse the html and remove style and javascript tags and then
            create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.kill_tags = ['style']
        doc = message.decode('utf-8', 'ignore')
        to_clean = lxml.html.fromstring(doc)
        cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(formatter.DumbWriter(
                                               textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(cleaned_msg)
        parser.close()
        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            if item.startswith('https://'):
                new_item = item.replace('https://', 'http://')
            else:
                new_item = item
            anchorlist += "[%d] %s\n" % (counter, new_item)
        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
Exemplo n.º 15
class Retriever(object):    # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile = 'index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '': # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)    # local directory
        if sep != '/':  
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 16
class Retriever(object):  # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':  # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)  # local directory
        if sep != '/':  # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):  # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):  # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url, )
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 17
    def create_plaintext_message(self, text):
        """ Create a plain-text-message by parsing the html
            and attaching links as endnotes

            Modified from EasyNewsletter/content/ENLIssue.py
        """
        # This reflows text which we don't want, but it creates
        # parser.anchorlist which we do want.
        textout = StringIO.StringIO()
        formtext = formatter.AbstractFormatter(formatter.DumbWriter(textout))
        parser = HTMLParser(formtext)
        parser.feed(text)
        parser.close()

        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        counter = 0
        anchorlist = "\n\n" + '----' + "\n\n"
        for item in parser.anchorlist:
            counter += 1
            anchorlist += "[%d] %s\n" % (counter, item)

        # This reflows text:
        # text = textout.getvalue() + anchorlist
        # This just strips tags, no reflow
        text = html.fromstring(text).text_content()
        text += anchorlist
        del textout, formtext, parser, anchorlist
        return text
Exemplo n.º 18
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Exemplo n.º 19
def get_urls(url):
    data = urllib.urlopen(url).read()
    parser = HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(
            cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    url_list = parser.anchorlist
    return url_list
Exemplo n.º 20
def html2text(html):
    f = StringIO()
    parser = HTMLParser(AbstractFormatter(DumbWriter(f)))
    try:
        parser.feed(html)
    except HTMLParseError:
        return ''
    else:
        parser.close()
        return f.getvalue()
Exemplo n.º 21
 def parse_links(self):
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Exemplo n.º 22
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(
         cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Exemplo n.º 23
 def parse_links(self):
     """fetch all links from page
     """
     f = open(self.save_file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Exemplo n.º 24
class Retriever(object):
	def __init__(self,url):
		self.url = url

	#parse HTML ,save links
	def parseAndGetLinks(self):
		self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		self.parser.feed(urlopen(self.url).read())
		self.parser.close()
		return self.parser.anchorlist
Exemplo n.º 25
 def parse_link(self):
     'Parse out the link'
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Exemplo n.º 26
def get_text_from_html( html_input ):
  "Strip tags and non-ascii characters from HTML input."
  my_stringio = StringIO.StringIO() # make an instance of this file-like string thing
  p = HTMLParser(AbstractFormatter(DumbWriter(my_stringio)))
  try:
    p.feed(html_input)
    p.close() #calling close is not usually needed, but let's play it safe
  except HTMLParseError:
    print '***HTML malformed***' #the html is badly malformed (or you found a bug)
  #return my_stringio.getvalue().replace('\xa0','')
  s = re.sub( r'[^\x00-\x7f]', r' ', my_stringio.getvalue() )
  s = s.replace('\r\n',' ').replace('\n',' ')
  s = re.sub( ' +', ' ', s )
  return s
Exemplo n.º 27
 def parse_links(self):
     """fetch all links from page
     """
     f = open(self.save_file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Exemplo n.º 28
 def parse_links(self):
     f = open(self.file, "r")
     data = f.read()
     f.close()
     parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(cStringIO.StringIO())))
     # parser = MyHTMLParser()
     parser.feed(data)
     parser.close()
     # The anchorlist attribute is not documented in the module
     # It returns all the anchors (href) found in the page
     # The attribute was deprecated after 2.6; to keep using it, write your own parser that subclasses HTMLParser (see evernote), or use a third-party library such as BeautifulSoup
     return parser.anchorlist
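As the comments note, anchorlist was deprecated; the subclassing approach they mention can be sketched with Python 2's HTMLParser module (the class name AnchorListParser is just an illustration):

# Sketch of a replacement for the deprecated anchorlist attribute.
from HTMLParser import HTMLParser as BaseHTMLParser

class AnchorListParser(BaseHTMLParser):
    def __init__(self):
        BaseHTMLParser.__init__(self)
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        # record the href of every <a> tag, mirroring the old anchorlist
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value is not None:
                    self.anchorlist.append(value)

parser = AnchorListParser()
parser.feed('<a href="http://example.com">x</a>')
print parser.anchorlist   # ['http://example.com']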
Exemplo n.º 29
 def parse(self, url):
     self.base = ""
     self.href = ""
     m = re.compile(".*/").match(url)
     if m != None:
         self.base = m.string[m.start(0):m.end(0)]
     result = urlfetch.fetch(url, headers = {'Cache-Control' : 'max-age=30', 'Pragma' : 'no-cache'} )
     if result.status_code == 200:
         logging.debug(str(result.status_code) + " OK " + url)
         HTMLParser.feed(self, result.content)
         HTMLParser.close(self)
     else:
         logging.error(str(result.status_code) + " NG " + url)
Exemplo n.º 30
class Retriever(object):

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        # Parse the URL with 'http:' as the default scheme; urlparse splits it into a tuple like (scheme, netloc, path, query, fragment)
        parsedurl = urlparse(url, 'http:', 0)
        # Join the host (netloc) and the path to form the local file path
        path = parsedurl[1] + parsedurl[2]
        # splitext splits path into (name, extension), e.g. ('/path/to/file', '.txt')
        ext = splitext(path)
        # If the extension is empty, append the default name index.htm to the path
        if ext[1] == '':
            # If the path already ends with '/', append index.htm directly; otherwise add a '/' first
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        # Take the directory part of path (e.g. www.shellbye.com\\blog) and join it with the local directory (e.g. 'D:\\www.shellbye.com\\blog')
        ldir = dirname(abspath(path))
        # On systems such as Windows that do not use '/' as the directory separator, replace '/' with the local separator
        # Unix-like systems use the same separator as URLs, so nothing needs to change there
        if sep != '/':        # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        # Create the ldir directory if it does not exist
        if not isdir(ldir):      # create archive dir if nec.
            # If ldir exists but is not a directory, remove it (unlink is the same as remove)
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):        # download Web page
        try:
            # Download self.url into self.file
            retval = urllib.urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url, )
        return retval

    def parseAndGetLinks(self):
        # Create a basic HTML parser; this line probably deserves a post of its own
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # Parse the html file and collect all the links (anchors with href)
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 31
class Retriever(object):#download web page
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    #local filename ,directory
    def filename(self, url, deffile = "index.htm"):
        parseurl = urlparse(url, 'http:', 0)
        path = parseurl[1] + parseurl[2]

        # splitext turns the path into a tuple; for a directory the second element is empty, for a file it is the file extension
        #path = "D:/pycharmProjects/PythonWebApp/weblearning/Crawl.py"
        #print splitext(path)
        #('D:/pycharmProjects/PythonWebApp/weblearning/Crawl', '.py')

        ext = splitext(path)
        if ext[1] == '':#no file use default
            # tuple[-index] is the index-th element counted from the end
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        # get the directory part of path,
        # path=D:/pycharmProjects/PythonWebApp/webLearning
        # dir(path)=D:/pycharmProjects/PythonWebApp
        ldir = dirname(path)#local directory
        if sep != '/': #os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir): #create archive dir if nec.
            # if a file with that name already exists, delete it
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        # urlretrieve() returns a 2-tuple, (filename, mime_hdrs)
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' %self.url,)
            print 'error, invalid url'
        return retval

    def parseAndGetLinks(self):#parse HTML , save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 32
class Retriever(object):
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        # Parse the URL with 'http:' as the default scheme; urlparse splits it into a tuple like (scheme, netloc, path, query, fragment)
        parsedurl = urlparse(url, 'http:', 0)
        # Join the host (netloc) and the path to form the local file path
        path = parsedurl[1] + parsedurl[2]
        # splitext splits path into (name, extension), e.g. ('/path/to/file', '.txt')
        ext = splitext(path)
        # If the extension is empty, append the default name index.htm to the path
        if ext[1] == '':
            # If the path already ends with '/', append index.htm directly; otherwise add a '/' first
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        # Take the directory part of path (e.g. www.shellbye.com\\blog) and join it with the local directory (e.g. 'D:\\www.shellbye.com\\blog')
        ldir = dirname(abspath(path))
        # On systems such as Windows that do not use '/' as the directory separator, replace '/' with the local separator
        # Unix-like systems use the same separator as URLs, so nothing needs to change there
        if sep != '/':  # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        # Create the ldir directory if it does not exist
        if not isdir(ldir):  # create archive dir if nec.
            # If ldir exists but is not a directory, remove it (unlink is the same as remove)
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):  # download Web page
        try:
            # Download self.url into self.file
            retval = urllib.urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url, )
        return retval

    def parseAndGetLinks(self):
        # Create a basic HTML parser; this line probably deserves a post of its own
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # Parse the html file and collect all the links (anchors with href)
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 33
0
def get_plain_from_html(html):
    """extract plain text from html

    >>> test_html = "<div><h1>Hey<h1><p>This is some text</p></div>"
    >>> get_plain_from_html(test_html)
    '\\nHey\\n\\nThis is some text'

    """
    from htmllib import HTMLParser  # import here to avoid high startup cost

    textout = StringIO()
    formtext = AbstractFormatter(DumbWriter(textout))
    parser = HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    return textout.getvalue()
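The docstring above doubles as a doctest, so it can be checked directly; a sketch, assuming the function lives in a module (the name crawl_utils is only a placeholder):

import doctest
import crawl_utils   # hypothetical module holding get_plain_from_html
print doctest.testmod(crawl_utils)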
Exemplo n.º 34
def get_plain_from_html(html):
    """extract plain text from html

    >>> test_html = "<div><h1>Hey<h1><p>This is some text</p></div>"
    >>> get_plain_from_html(test_html)
    '\\nHey\\n\\nThis is some text'

    """
    from htmllib import HTMLParser  # import here to avoid high startup cost

    textout = StringIO()
    formtext = AbstractFormatter(DumbWriter(textout))
    parser = HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    return textout.getvalue()
Exemplo n.º 35
class Retriever:
    '''
    responsibilities:
    download, parse and queue
    '''
    def __init__(self,url):
        '''
        constructor of the class. Instantiates the Retriever object and stores the url and filename as local attributes
        '''
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile = 'index.html'):
        '''
        input: url
        removes the http prefix
        index.html will be the default file name for storage of the url:this can be overridden by passing arguments to filename()
        '''
        parsedurl = urlparse(url,"http:",0) #parse path
        path = parsedurl[1] + parsedurl[2]
        text = splitext(path)
        if text[1] == '': #no file, use default
            if path[-1] == '/':
                path = path + deffile
            else:
                path = path + '/' + deffile
        dir = dirname(path)
        if not isdir(dir):  #create a new directory if necessary
            if exists(dir): unlink(dir)
            makedirs(dir)
        return path

    def download(self): #download web page
        try:
            retval = urlretrieve(self.url,self.file)
        except IOError:
            retval = ('***ERROR invalid url "%s"'%self.url,)
        return retval

    def parseAndGetLinks(self): #parse HTML and get links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 36
 def parse(self, url):
     self.base = ""
     self.href = ""
     m = re.compile(".*/").match(url)
     if m != None:
         self.base = m.string[m.start(0):m.end(0)]
     result = urlfetch.fetch(url,
                             headers={
                                 'Cache-Control': 'max-age=30',
                                 'Pragma': 'no-cache'
                             })
     if result.status_code == 200:
         logging.debug(str(result.status_code) + " OK " + url)
         HTMLParser.feed(self, result.content)
         HTMLParser.close(self)
     else:
         logging.error(str(result.status_code) + " NG " + url)
Exemplo n.º 37
File: crawl.py  Project: wengowl/gae
class Retriever():  # download web pages
    def __init__(self, url):
        self.url = url

    def download(self):  # download web page
        print 'try to open url:', self.url, '\nthe true url process', string.split(
            self.url, '?')[0]
        try:
            retval = urlopen(string.split(self.url, '?')[0], None, 200)
        except urllib2.HTTPError as e:
            print "HTTPError", e
            return
        except socket.timeout as e:
            print "socket.timeout", e
            return
        except socket.error as e:
            print "socket.error", e
            return
        except urllib2.URLError as e:
            print "URLError: ", e
            return
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        r = self.download()
        if r:
            print '________'
            try:
                try:
                    s = r.read(50000)
                except socket.error as e:
                    print "***************************socket error***************************", e
                    return []
                self.parser.feed(s)
                print '------------------'

                r.close()
                print '***************************'
            except HTMLParseError:
                print 'get links error\n'
                return []

        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 38
class LinkFinder(object):
    def __init__(self, base_url, page_url):
        self.base_url = base_url
        self.page_url = page_url

    def parseAndGetLinks(self, html_string):
        try:
            self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
            self.parser.feed(html_string)
            self.parser.close()
            links = []
            for eachLink in self.parser.anchorlist:
                if eachLink[:4] != "http" and find(eachLink, "://") == -1:
                    eachLink = urljoin(self.base_url, eachLink)
                links.append(eachLink)
            return links
        except IOError:
            return []
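A short usage sketch for LinkFinder; the URLs and markup below are made up. Relative anchors are resolved against base_url via urljoin, while absolute http links pass through untouched:

finder = LinkFinder('http://example.com/', 'http://example.com/index.html')
html = '<a href="/about.html">About</a> <a href="http://other.org/">Other</a>'
print finder.parseAndGetLinks(html)   # ['http://example.com/about.html', 'http://other.org/']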
Exemplo n.º 39
 def compactor(dev_filename, rel_filename):
     # Use compactor to generate release version.
     echo('Compacting: %s -> %s' % (dev_filename, rel_filename))
     source_data = open(dev_filename, 'r').read()
     try:
         # Verify that the html file is correct
         htmlparser = HTMLParser(NullFormatter())
         htmlparser.feed(source_data)
         htmlparser.close()
         # Now try to minify
         output_file = open(rel_filename, 'wb')
         compactor = HTMLMinifier(output_file.write, True)
         compactor.feed(source_data)
         compactor.close()
         output_file.close()
     except HTMLParseError as e:
         error(str(e))
         exit(1)
Exemplo n.º 40
 def compactor(dev_filename, rel_filename):
     # Use compactor to generate release version.
     echo('Compacting: %s -> %s' % (dev_filename, rel_filename))
     source_data = open(dev_filename, 'r').read()
     try:
         # Verify that the html file is correct
         htmlparser = HTMLParser(NullFormatter())
         htmlparser.feed(source_data)
         htmlparser.close()
         # Now try to minify
         output_file = open(rel_filename, 'wb')
         compactor = HTMLMinifier(output_file.write, True)
         compactor.feed(source_data)
         compactor.close()
         output_file.close()
     except HTMLParseError as e:
         error(str(e))
         exit(1)
Exemplo n.º 41
File: crawl.py  Project: wengowl/gae
class Retriever():  # download web pages
    def __init__(self, url):
        self.url = url

    def download(self):  # download web page
        print 'try to open url:', self.url, '\nthe true url process', string.split(self.url, '?')[0]
        try:
            retval = urlopen(string.split(self.url, '?')[0], None, 200)
        except urllib2.HTTPError as e:
            print "HTTPError", e
            return
        except socket.timeout as e:
            print "socket.timeout", e
            return
        except socket.error as e:
            print "socket.error", e
            return
        except urllib2.URLError as e:
            print "URLError: ", e
            return
        return retval

    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        r = self.download()
        if r:
            print '________'
            try:
                try:
                    s = r.read(50000)
                except socket.error as e:
                    print "***************************socket error***************************", e
                    return []
                self.parser.feed(s)
                print '------------------'

                r.close()
                print '***************************'
            except HTMLParseError:
                print 'get links error\n'
                return []

        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 42
def collectURLSFromPage(page):
    """
    This returns a list of URLS that come from a certain page.
    Useful for spiders. It takes just a string as an argument.
    """

    resultList = []
    if page == "":
        #nothing to parse, so nothing to return
        return resultList

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)
            pass

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)
        pass

    #the htmllib parser built below records every <a href> it sees in parser.anchorlist
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()

    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
        pass

    return resultList
Exemplo n.º 43
class Retriever(object):	# download Web pages
	"""docstring for Retriever"""
	def __init__(self, url):
		self.url = url
		self.file = self.filename(url)
		
	def filename(self, url, deffile='index.html'):
		parsedurl = urlparse(url, 'http:', 0)	# parse path
		print '====PARSEDURL====',parsedurl
		if parsedurl[2] == '':
			path = parsedurl[1] + '/'
		else:
			path = parsedurl[1] + parsedurl[2]
		print '------PATH-----', path
		ext = splitext(path)
		print '-----EXT----', ext
		if ext[1] == '':	# no file, use default
			if path[-1] == '/':
				path +=deffile
			else:
				path += '/' + deffile
		ldir = dirname(path)	# local directory
		print '+++++++++++++++++', ldir
		if sep != '/':	# os-indep. path separator
			ldir = replace(ldir, '/', sep)
		if not isdir(ldir):	# create archive dir if nec.
			if exists(ldir): unlink(ldir)
			makedirs(ldir)
		return path

	def download(self):	# download Web page
		try:
			retval = urlretrieve(self.url, self.file)
		except IOError:
			retval = ('*** ERROR: invalid URL "%s"' % self.url,)
		return retval

	def parseAndGetLinks(self):	# parse HTML, save links
		self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		self.parser.feed(open(self.file).read())
		self.parser.close()
		return self.parser.anchorlist
Exemplo n.º 44
class LinkFinder(object):

    def __init__(self,base_url,page_url):
        self.base_url = base_url
        self.page_url = page_url

    def parseAndGetLinks(self,html_string):
        try:
            self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
            self.parser.feed(html_string)
            self.parser.close()
            links = []
            for eachLink in self.parser.anchorlist:
                if eachLink[:4] != "http" and find(eachLink, "://") == -1:
                    eachLink = urljoin(self.base_url, eachLink)
                links.append(eachLink)
            return links
        except IOError:
            return []
Exemplo n.º 45
File: crawl.py  Project: DaZhu/all
class Retriever(object):	# download Web pages

	def __init__(self, url):
		self.url = url
		self.file = self.filename(url)
		

	def filename(self, url, deffile='index.htm'):
		parsedurl = urlparse(url, 'http:', 0)  # parse path
		path = parsedurl[1] + parsedurl[2]
		ext = splitext(path)
		print path
		if ext[1] == '':
			if path[-1] == '/':
				path += deffile
			else:
				path += '/' + deffile
		ldir = dirname(path)	# local directory
		
		if sep != '/':		# os-indep. path separator
			ldir = replace(ldir, '/', sep)
		if not isdir(ldir):	  # create archive dir if nec.
			if exists(ldir): unlink(ldir)
			print ldir, "aaaaaaaaa"
			makedirs(ldir)

		return path

	def download(self):		# download Web page
		try:
			retval = urllib.urlretrieve(self.url, self.file)
		except IOError:
			retval = ('*** ERROR: invalid URL "%s"' % \
				self.url, )
		return retval

	def parseAndGetLinks(self):	# pars HTML, save links
		self.parser = HTMLParser(AbstractFormatter( \
			DumbWriter(StringIO())))
		self.parser.feed(open(self.file).read())
		self.parser.close()
		print self.parser
		return self.parser.anchorlist
Exemplo n.º 46
class Retriever(object):
    
    def __init__(self,url):
        self.url=url
        self.file=self.filename(url)
    
    def filename(self,url,deffile='index.htm'):
        parsedurl=urlparse(url,'http:',0)
        path=parsedurl[1]+parsedurl[2]
        ext=splitext(path)
        # the page being crawled must be static html
        if ext[1]=='':
            if path[-1]=='/':
                path+=deffile
            else:
                path+='/'+deffile
        #print path
        # create the local file directory
        ldir=dirname(path)
        #print ldir
        # convert the '/' in the url into the Windows separator
        if sep!='/':
            ldir=replace(ldir,'/',sep)
        
        if not isdir(ldir):
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval=urlretrieve(self.url,self.file)
        except IOError:
            retval=('***error in url "%s"'%self.url)
        return retval

    def parseAndGetLink(self):
        # build a parser
        self.parser= HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 47
class Retriever(object):
    'download web pages'
    
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)
        
    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)
        path = parsedurl[1] + parsedurl[2]
        print path
        ext = splitext(path)#return (filename, extension)
        if ext[1] == '': #no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
                
        ldir = dirname(path) #local directory
        if sep != '/': #os-indep.path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir): #create archive dir if nec
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
            
        return path
        
    def download(self):
        'download web page'
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: Invalid URL "%s"' %self.url)
        return retval
            
    def parseAndGetLinks(self):
        'parse HTML, save links'
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 48
class Retriever(object):
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)
        print self.file
    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http', 0)
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path) # split off the filename extension
        print path,ext
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        print path
        ldir = dirname(path)
        # depends on the operating system; under Windows sep = '\\'
        if sep != '/':
            ldir = replace(ldir, '/', sep)
        print ldir
        if not isdir(ldir):
            if exists(ldir):
                return
            makedirs(ldir)
        return path

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** Error URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        """StringIO reads the data from memory, DumbWriter turns the event stream into plain text, and the AbstractFormatter class does the formatting
        """
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Exemplo n.º 49
    def create_plaintext_message(self, text):
        """ Create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(
            formatter.DumbWriter(textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(text)
        parser.close()

        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for counter, item in enumerate(parser.anchorlist):
            anchorlist += "[%d] %s\n" % (counter, item)

        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
Exemplo n.º 50
    def _create_plaintext_message(self, text):
        """ Create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(
            formatter.DumbWriter(textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(text)
        parser.close()

        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for counter, item in enumerate(parser.anchorlist):
            anchorlist += "[{0:d}] {1:s}\n".format(counter, item)

        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
Exemplo n.º 51
def collectURLSFromPage(page):

    resultList=[]


    #print "Doing form parser"
    if page.count("<form")>0:
        otherlist=daveFormParse(page)
        for key in otherlist:
            resultList.append(key)
            pass

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList=rawParse(page)
    for key in spamList:
        resultList.append(key)
        pass

    #the "AbstractFormatter()" line builds an htmllib parser whose only job here
    #is to record every <a href> it sees in parser.anchorlist
    try:
        parser=HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()
        
    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
            resultList.append(key)
            pass
            
    return resultList
Exemplo n.º 52
def collectURLSFromPage(page):

    resultList = []

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)
            pass

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)
        pass

    #the "AbstractFormatter()" line builds an htmllib parser whose only job here
    #is to record every <a href> it sees in parser.anchorlist
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()

    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
        pass

    return resultList
Exemplo n.º 53
class Retriever(object): #download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)
        
    def filename(self, url): 
        path=url

        path = re.sub("\W","_",path)
        path+=".html"
        return path
    
    def isForbidden(self):
        return 0
    
    def download(self):
        try:
            if True:
                retval = urlretrieve(self.url, self.file)
                javaGroupContent=JavaGroupContent.JavaGroupContent() 
                javaGroupContent.meet_page(self.url, self.file)
            else:
                retval = '*** INFO: no need to download '
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval
        
    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        try:
            self.parser.feed(open(self.file).read())
            self.parser.close()
        except IOError:
            pass
        return self.parser.anchorlist
Exemplo n.º 54
class Retriever(object): 
	"""docstring for Retriever"""
	def __init__(self, url):# constructor; saves the URL string and the matching filename returned by filename() as attributes on this instance.
		#super(Retriever, self).__init_url
		self.url = url
		self.file=self.filename(url)

	def filename(self,url,deffile='index.html'):
		parsedurl=urlparse(url,'http',0)   # the page url, the default scheme, and whether incomplete content is allowed. urlparse splits the url into six elements: (prot_sch,net_loc,path,params,query,frag).
		path=parsedurl[1]+parsedurl[2]  # e.g. http://csdn.net/name/articials/details/44444.html   build the file path
		ext=splitext(path)          # split at the '.' into filename and extension
		if ext[1]=='':# no file, use default
			if path[-1]=='/':
				path+=deffile
			else:
				path+='/'+deffile    # path of the page to load
		ldir=dirname(path)    # extract the directory part of the path string
		print ldir
		if sep!='/': #sep=='\'
			ldir=replace(ldir,'/',sep)
		if not isdir(ldir):
			if exists(ldir):unlink(ldir)
			makedirs(ldir)             # create the directory
		return path

	def download(self):  # download the page
		try:
			cookie_support=urllib2.HTTPCookieProcessor(cookielib.CookieJar())
			self.opener=urllib2.build_opener(cookie_support,urllib2.HTTPHandler)
			urllib2.install_opener(self.opener)
			user_agent=[ 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
							'Opera/9.25 (Windows NT 5.1; U; en)',
							'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
							'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
							'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
							'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
							"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
							"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",]
 			agent=random.choice(user_agent)
			self.opener.addheaders=[("User-agent",agent),("Accept","*/*"),('Referer','http:www.google.com')]
			urll=self.opener.open(self.url)
			html=urll.read()
			output=open(self.file,'w')
			output.write(html)
			output.close()
			retval=self.url
			#print retval
			#retval=urlretrieve(self.url,self.file)
			return retval
		except IOError:
			retval=('*** error:invalid URl "%s"'%self.url,)
			return retval
		else:
			pass
		finally:
			pass

	def parseAndGetLinks(self):# parse the page and collect urls
		self.parser=HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		self.parser.feed(open(self.file).read())
		self.parser.close()
		return self.parser.anchorlist  # list of anchor links
Exemplo n.º 55
class Retriever(object):
    '''Retrieve and parse every downloaded web page'''
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        parsedurl = urlparse(url)
        # returns a tuple of six string items: scheme, netloc, path, params, query, fragment

        path = parsedurl.netloc + parsedurl.path
        ext = splitext(path)  # splitext separates the file path (path) from the file
        # extension (ext), e.g. for a.png, ext=('a', '.png')

        if ext[1] == '':
            #no file, use default
            # e.g. www.baidu.com --> www.baidu.com/index.html
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile

        ldir = dirname(path)  # dirname strips the filename and returns the directory
        if sep != '/':  # os.sep is the path separator used by the local system
            ldir = replace(ldir, '/', sep)

        if isdir(ldir):
            # isdir checks whether the path is a directory.
            if exists(ldir):
                #unlink(ldir) # the unlink method deletes a file; it raises an error if the target is a directory.
                shutil.rmtree(ldir)
                print 'directory [%s] already exists, removing it....' % (ldir)

        print 'creating directory --> [%s]' % ldir
        makedirs(ldir)  # create the directory
        return path

    def download(self):  # download the page
        try:
            retval = urlretrieve(self.url, self.file, callbackInfo)
            '''
            url: the address to download from
            filename: the local path to save to (if not given, urllib creates a temporary file to hold the data)
            reporthook: a callback fired once the connection is made and after each data block is transferred; it can be used to show download progress
            data: data to POST to the server; the method returns a (filename, headers) tuple, where filename is the local path and headers is the server's response headers
            :return filename, headers
            '''
        except IOError:
            retval = ('*** ERROR: invald URL "%s"' % (self.url))

        return retval

    def parseAndGetLinks(self):
        '''Parse the html page, extract the links in it, and save them'''

        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # handled with HTMLParser; StringIO reads data from memory, and DumbWriter turns the event stream into plain text.
        self.parser.feed(open(self.file).read())
        # open self.file and feed its whole contents into the parser defined above

        self.parser.close()
        print 'self.parser.anchorlist --> ', self.parser.anchorlist
        return self.parser.anchorlist  # anchorlist records the href addresses
class Retriever(object):    # download web page
    def __init__(self,url):
        self.url = url
        self.file = self.filename(url)

    def filename(self,url,deffile='index.html'):
        parsedurl = urlparse(url,'http',0)    # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':    # no file,use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)    # local directory
        if sep != '/':    # os-indep. path separator.
            ldir = replace(ldir,'/',sep)
        if not isdir(ldir):    # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):    # download web page
        try:
            retval = urlretrieve(self.url,self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):    # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(\
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist

class Crawler(object):    # manage entire crawling process

    count = 0    # static download page counter

    def __init__(self,url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self,url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':    # error situation, do not parse
            print retval, '...skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:',url
        print 'FILE:',retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachlink in links:
            if eachlink[:4] != 'http' and \
                    find(eachlink,'://') == -1:
                eachlink = urljoin(url,eachlink)
            print '* ',eachlink

            if find(lower(eachlink),'mailto') != -1:
                print '...discarded, mailto link'
                continue

            if eachlink not in self.seen:
                if find(eachlink,self.dom) == -1:
                    print '...discarded, not in domain'
                else:
                    if eachlink not in self.q:
                        self.q.append(eachlink)
                        print '...new, added to Q'
                    else:
                        print '...discarded, already in Q'
            else:
                print '...discarded, already processed'

    def go(self):    # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)

def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt,EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()