Example #1
File: Test.py Project: lgwjd0090/testgit
 def parseAndGetLinks(self):
     """StringIO是从内存中读取数据 DumbWriter将事件流转换为存文本文档  AbstractFormatter 类进行格式化
     """
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     self.parser.feed(open(self.file).read())
     self.parser.close()
     return self.parser.anchorlist
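The pattern above can also be tried on its own. A minimal standalone sketch, assuming Python 2 (htmllib and formatter were removed in Python 3); the get_links helper and the sample markup are illustrative, not part of the original project:

from StringIO import StringIO
from formatter import AbstractFormatter, DumbWriter
from htmllib import HTMLParser

def get_links(html):
    # The formatted text is written into an in-memory buffer that is never read;
    # only the anchorlist collected while parsing is of interest.
    parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    parser.feed(html)
    parser.close()
    return parser.anchorlist

print get_links('<a href="http://example.com/">example</a>')  # ['http://example.com/']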
Example #2
 def OpenURL(self, url):
     from htmllib import HTMLParser
     import formatter
     self.url = url
     m = re.match(r'http://([^/]+)(/\S*)\s*', url)
     if m:
         host = m.groups()[0]
         path = m.groups()[1]
     else:
         m = re.match(r'http://(\S+)\s*', url)
         if not m:
             # Invalid URL
             self.logprint("Invalid or unsupported URL: %s" % (url))
             return
         host = m.groups()[0]
         path = ''
     f = self.RetrieveAsFile(host, path)
     if not f:
         self.logprint("Could not open %s" % (url))
         return
     self.logprint("Receiving data...")
     data = f.read()
     tmp = open('hangman_dict.txt', 'w')
     fmt = formatter.AbstractFormatter(formatter.DumbWriter(tmp))
     p = HTMLParser(fmt)
     self.logprint("Parsing data...")
     p.feed(data)
     p.close()
     tmp.close()
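For reference, the first regular expression splits an absolute URL into a host and a path; a quick check of that pattern (the sample URL is illustrative only):

import re
m = re.match(r'http://([^/]+)(/\S*)\s*', 'http://example.com/words/list.txt')
print m.groups()  # ('example.com', '/words/list.txt')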
Example #3
 def parseAndGetLinks(self):
     # Create a basic HTML parser; this single line could take a separate article to explain properly
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     # Parse the HTML file and collect all of its links (anchors with an href)
     self.parser.feed(open(self.file).read())
     self.parser.close()
     return self.parser.anchorlist
Example #4
 def parseAndGetLinks(self):  # parse HTML, save links
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     self.parser.feed(open(self.file).read())
     self.parser.close()
     print self.parser
     return self.parser.anchorlist
Example #5
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javascript tags and then
        create a plain-text-message by parsing the html
        and attaching links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(
        formatter.DumbWriter(textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
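A quick usage sketch, assuming the module-level imports the snippet relies on (lxml.html, the lxml Cleaner, cStringIO, formatter, and htmllib's HTMLParser); note the argument is expected to be a UTF-8 byte string, since the function calls message.decode('utf-8', 'ignore'). The sample markup is illustrative only:

html_bytes = '<html><body><p>Hello there.</p><a href="https://example.com/">read more</a></body></html>'
print create_plaintext_message(html_bytes)
# The body text is followed by a ruler line and endnotes such as "[1] http://example.com/"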
Example #6
File: textformat.py Project: mishas/robin
def html2text(htmldata):
    # patch htmldata
    htmldata = htmldata.replace("<br/>", "<br>")

    fmt = HTMLtoTextFormatter()
    prs = HTMLParser(fmt)
    prs.feed(htmldata)
    return fmt.getText()
Example #7
 def parseAndGetLinks(self):
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     try:
         self.parser.feed(open(self.file).read())
         self.parser.close()
     except IOError:
         pass
     return self.parser.anchorlist
Example #8
def get_urls(url):
    data = urllib.urlopen(url).read()
    parser = HTMLParser(
        formatter.AbstractFormatter(formatter.DumbWriter(
            cStringIO.StringIO())))
    parser.feed(data)
    parser.close()
    url_list = parser.anchorlist
    return url_list
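A usage sketch, assuming urllib, cStringIO, formatter, and htmllib's HTMLParser are imported at module level as the snippet implies; the URL is illustrative:

for link in get_urls('http://www.example.com/'):
    print link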
Example #9
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Example #10
 def _clean_text(self, text):
     try:
         text = text.replace("&nbsp;", " ")
         text = text.strip()
         parser = HTMLParser(None)
         parser.save_bgn()
         parser.feed(text)
         return parser.save_end()
     except:
         return text
Example #11
 def parse_links(self):
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Example #12
def html2text(html):
    f = StringIO()
    parser = HTMLParser(AbstractFormatter(DumbWriter(f)))
    try:
        parser.feed(html)
    except HTMLParseError:
        return ''
    else:
        parser.close()
        return f.getvalue()
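A usage sketch, assuming the StringIO, formatter, and htmllib imports the snippet implies; the markup is illustrative:

print html2text('<h1>Title</h1><p>Some body text.</p>')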
Example #13
def get_text_from_html( html_input ):
  "Strip tags and non-ascii characters from HTML input."
  my_stringio = StringIO.StringIO() # make an instance of this file-like string thing
  p = HTMLParser(AbstractFormatter(DumbWriter(my_stringio)))
  try:
    p.feed(html_input)
    p.close()  # calling close is not usually needed, but let's play it safe
  except HTMLParseError:
    print '***HTML malformed***'  # the html is badly malformed (or you found a bug)
  #return my_stringio.getvalue().replace('\xa0','')
  s = re.sub( r'[^\x00-\x7f]', r' ', my_stringio.getvalue() )
  s = s.replace('\r\n',' ').replace('\n',' ')
  s = re.sub( ' +', ' ', s )
  return s
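A usage sketch; the sample markup (with a character entity and non-ASCII bytes) is illustrative only, and assumes the StringIO, formatter, htmllib, and re imports the snippet relies on:

print get_text_from_html('<p>Fish &amp; chips \xc2\xa330</p>')
# prints the text with tags stripped, the entity decoded, and non-ASCII bytes blanked out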
Example #14
    def parseAndGetLinks(self):
        '''Parse the HTML page, extract the links it contains, and save them'''

        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # Process it with HTMLParser; StringIO reads the data from memory, and DumbWriter turns the event stream into plain text.
        self.parser.feed(open(self.file).read())
        # Open self.file and feed its entire contents to the parser defined above in one pass

        self.parser.close()
        print 'self.parser.anchorlist --> ', self.parser.anchorlist
        return self.parser.anchorlist  # anchorlist records the href addresses
Example #15
 def parse_link(self):
     'Parse out the link'
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Example #16
 def parse_links(self):
     """fetch all links from page
     """
     f = open(self.save_file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(
         formatter.AbstractFormatter(
             formatter.DumbWriter(cStringIO.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Example #17
 def parseAndGetLinks(self, html_string):
     try:
         self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
         self.parser.feed(html_string)
         self.parser.close()
         links = []
         for eachLink in self.parser.anchorlist:
             if eachLink[:4] != "http" and find(eachLink, "://") == -1:
                 eachLink = urljoin(self.base_url, eachLink)
             links.append(eachLink)
         return links
     except IOError:
         return []
Example #18
 def parse_html(self, html):
     from StringIO import StringIO
     from formatter import (AbstractFormatter, DumbWriter)
     from htmllib import HTMLParser
     _html = re.sub(self.notrans_tag, r" \1 ", html)
     buf = StringIO()
     p = HTMLParser(AbstractFormatter(DumbWriter(buf)))
     p.feed(_html)
     _sub = re.sub(self.whitespaces, " ", buf.getvalue())
     # FIXME: how can zerowidth be removed more simply?
     _sub = re.sub(self.zerowidth, "", _sub)
     _sub = re.sub(self.colon, r"\1", _sub)
     return _sub
Example #19
def get_plain_from_html(html):
    """extract plain text from html

    >>> test_html = "<div><h1>Hey<h1><p>This is some text</p></div>"
    >>> get_plain_from_html(test_html)
    '\\nHey\\n\\nThis is some text'

    """
    from htmllib import HTMLParser  # import here to avoid high startup cost

    textout = StringIO()
    formtext = AbstractFormatter(DumbWriter(textout))
    parser = HTMLParser(formtext)
    parser.feed(html)
    parser.close()
    return textout.getvalue()
Example #20
 def compactor(dev_filename, rel_filename):
     # Use compactor to generate release version.
     echo('Compacting: %s -> %s' % (dev_filename, rel_filename))
     source_data = open(dev_filename, 'r').read()
     try:
         # Verify that the html file is correct
         htmlparser = HTMLParser(NullFormatter())
         htmlparser.feed(source_data)
         htmlparser.close()
         # Now try to minify
         output_file = open(rel_filename, 'wb')
         compactor = HTMLMinifier(output_file.write, True)
         compactor.feed(source_data)
         compactor.close()
         output_file.close()
     except HTMLParseError as e:
         error(str(e))
         exit(1)
Example #21
def collectURLSFromPage(page):
    """
    This returns a list of URLS that come from a certain page.
    Useful for spiders. It takes just a string as an argument.
    """

    resultList = []
    if page == "":
        #nothing to parse, so nothing to return
        return resultList

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)
            pass

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)
        pass

    #This needs to be documented somehow, but I have no idea what it does
    #(it builds an htmllib parser whose anchorlist collects every href seen while feeding the page)
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()

    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
        pass

    return resultList
Example #22
    def _create_plaintext_message(self, text):
        """ Create a plain-text-message by parsing the html
            and attaching links as endnotes
        """
        plain_text_maxcols = 72
        textout = cStringIO.StringIO()
        formtext = formatter.AbstractFormatter(
            formatter.DumbWriter(textout, plain_text_maxcols))
        parser = HTMLParser(formtext)
        parser.feed(text)
        parser.close()

        # append the anchorlist at the bottom of a message
        # to keep the message readable.
        anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
        for counter, item in enumerate(parser.anchorlist):
            anchorlist += "[{0:d}] {1:s}\n".format(counter, item)

        text = textout.getvalue() + anchorlist
        del textout, formtext, parser, anchorlist
        return text
Example #23
File: crawl.py Project: wengowl/gae
    def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        r = self.download()
        if r:
            print '________'
            try:
                try:
                    s = r.read(50000)
                except socket.error as e:
                    print "***************************socket error***************************", e
                    return []
                self.parser.feed(s)
                print '------------------'

                r.close()
                print '***************************'
            except HTMLParseError:
                print 'get links error\n'
                return []

        self.parser.close()
        return self.parser.anchorlist
Example #24
def collectURLSFromPage(page):

    resultList = []

    #print "Doing form parser"
    if page.count("<form") > 0:
        otherlist = daveFormParse(page)
        for key in otherlist:
            resultList.append(key)
            pass

    #DEBUG
    #return resultList

    #print "Doing RAW Parser"
    spamList = rawParse(page)
    for key in spamList:
        resultList.append(key)
        pass

    #the whole "AbstractFormater()" line is a bunch of crap I copied
    #That needs to be documented somehow, but I have no idea what it does
    try:
        parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        parser.feed(page)
        parser.close()

    except:
        #print "DEBUG: Caught an exception trying to parse that html file."
        #print "(Not sure why this happens - you'll have to crawl this page manually)"
        return resultList

    #print "Adding HTML Parser data"
    for key in parser.anchorlist:
        resultList.append(key)
        pass

    return resultList
Example #25
#coding:utf-8

import urllib2
from htmllib import HTMLParser
from formatter import NullFormatter
import os
import re

url_name = "http://b.hatena.ne.jp/hotentry"
html_data = urllib2.urlopen(url_name)
parser = HTMLParser(NullFormatter())

try:
    parser.feed(html_data.read())
except TypeError:
    print "type error"

pat = re.compile("^http.*")
for link in parser.anchorlist:
    x = pat.search(link)
    if x is not None:
        print x.group(0)

Example #26
File: spider.py Project: Micats/SpiderWeb
	def parseAndGetLinks(self):  # parse the page and extract its URLs
		self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		self.parser.feed(open(self.file).read())
		self.parser.close()
		return self.parser.anchorlist  # list of anchor (href) links
Example #27
 def parseAndGetLinks(self):    # parse HTML, save links
     self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     self.parser.feed(open(self.file).read())
     self.parser.close()
     return self.parser.anchorlist
Example #28
def unescape(data):
    # Parse with no formatter: save_bgn()/save_end() capture the decoded
    # character data, so entities such as &amp; come back as plain text.
    p = HTMLParser(None)
    p.save_bgn()
    p.feed(data)
    return p.save_end()
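A usage sketch for the helper above; note that with a None formatter it is only safe on plain text containing character entities, not on markup with tags:

print unescape('Fish &amp; chips')  # Fish & chips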
Example #29
	def parseAndGetLinks(self):
		self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
		self.parser.feed(urlopen(self.url).read())
		self.parser.close()
		return self.parser.anchorlist
Example #30
def html2text(html):
    output = StringIO()
    writer = DumbWriter(output)
    p = HTMLParser(AbstractFormatter(writer))
    p.feed(toText(html))
    return toText(output.getvalue())