def _set_data_(self, data, *args, **kwargs):
    if isinstance(data, QtGui.QTextDocument):
        self._docViewer_.setDocument(data)
    elif isinstance(data, str):
        from html.parser import HTMLParser
        parser = HTMLParser(convert_charrefs=True)
        parser.feed(data)
        parser.close()
        if parser.get_starttag_text() is None:
            self._docViewer_.document().setPlainText(data)
        else:
            self._docViewer_.document().setHtml(data)
            if data.find("<?xml version=") >= 0:
                self._highlighter_ = xmlutils.XmlSyntaxHighlighter(
                    self._docViewer_.document())
            else:
                self._highlighter_ = None
    else:
        raise TypeError(
            "Expecting a QTextDocument or a str; got %s instead"
            % type(data).__name__)

    if kwargs.get("show", True):
        self.activateWindow()
def feed(self, data):
    """
    :param data: Raw SAMI unicode string
    :returns: tuple (str, dict, set)
    """
    no_cc = 'no closed captioning available'

    if '<html' in data.lower():
        raise CaptionReadSyntaxError('SAMI File seems to be an HTML file.')
    elif no_cc in data.lower():
        raise CaptionReadSyntaxError(f'SAMI File contains "{no_cc}"')

    # try to find style tag in SAMI
    try:
        # prevent BS4 error with huge SAMI files with unclosed tags
        index = data.lower().find("</head>")
        style = BeautifulSoup(data[:index], "lxml").find('style')
        if style and style.contents:
            self.styles = self._css_parse(' '.join(style.contents))
    except AttributeError:
        self.styles = {}

    # fix erroneous italics tags
    data = data.replace('<i/>', '<i>')
    # fix awkward tags found in some SAMIs
    data = data.replace(';>', '>')

    HTMLParser.feed(self, data)

    # close any tags that remain in the queue
    while self.queue != deque([]):
        closing_tag = self.queue.pop()
        self.sami += f"</{closing_tag}>"

    return self.sami, self.styles, self.langs
def fuzz(buf):
    try:
        string = buf.decode("ascii")
        parser = HTMLParser()
        parser.feed(string)
    except UnicodeDecodeError:
        pass
def run(self, event_pack: EventPackage):
    random.seed(time.time())

    # prepare the search terms
    searchTerms = event_pack.body
    searchTerms.pop(0)
    search = "sfw+" + "+".join(searchTerms)
    url = "https://www.google.com/search?tbm=isch&q=" + search + "&oq=" + search + "&gs_l=img&safesearch=on"

    # get the page
    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    text = response.read()

    # html parser
    parser = HTMLParser()
    theImages = []

    def handleTag(tag, attrs):
        if tag == "img":
            for n in attrs:
                if n[0] == "data-src":
                    # print(n[1])
                    theImages.append(str(n[1]))

    parser.handle_starttag = handleTag
    parser.feed(str(text))

    # randint's upper bound is inclusive, so use len - 1 to avoid an IndexError
    nrimg = random.randint(0, len(theImages) - 1)
    return theImages[nrimg]
def get_html_text(html: str):
    parser = HTMLParser()
    parser.text = ""
    parser.important_tag = True
    parser.feed(html)
    return parser.text.strip()
def feed(self, raw_data):
    assert isinstance(raw_data, str), "feed data must be unicode!"

    data = raw_data.strip()

    # cut out <pre> and <tt> block tag areas
    data = block_re.sub(self._pre_cut_out, data)
    data = inline_re.sub(self._pre_cut_out, data)

    # Delete whitespace from html code
    data = strip_html(data)

    if self.debugging:
        print("_" * 79)
        print("raw data:")
        print(repr(raw_data))
        print(" -" * 40)
        print("cleaned data:")
        print(data)
        print("-" * 79)
        # print(clean_data.replace(">", ">\n"))
        # print("-"*79)

    HTMLParser.feed(self, data)

    return self.root
def _strip_tags(self, html):
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
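Several of the stripping helpers in this listing assign handle_data directly onto a bare HTMLParser instance. For comparison, here is a minimal subclass-based sketch of the same technique; the TagStripper name and the sample input are illustrative and not taken from any of the projects above.

from html.parser import HTMLParser

class TagStripper(HTMLParser):
    # illustrative subclass, not part of any snippet in this listing
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.parts = []

    def handle_data(self, data):
        # collect every text node found between tags
        self.parts.append(data)

    def get_text(self):
        return ''.join(self.parts)

stripper = TagStripper()
stripper.feed('<p>Hello <b>world</b></p>')
stripper.close()
print(stripper.get_text())  # -> Hello world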
def feed(self, data, mode=0):
    self.mode = mode
    # {product name: link to the product} or a list of manufacturers
    self.result = {} if not self.mode else []
    # with open(r'd:\temp\just\page.html', 'w') as fo:
    #     fo.write(data)
    HTMLParser.feed(self, data)
def scrapeSubPages(_url, _depth, _superUrlSet=set()):
    # perform scraping given the url
    try:
        page = request.urlopen(_url)
    except Exception as e:
        # may catch an unauthorized error 401
        print("ERROR! {}".format(e))
        return

    scrapeParser = HTMLParser(_url)
    scrapeParser.feed(str(page.read()))
    # print(scrapeParser.scrapedEmails)

    # iterate through all newly found urls in this webpage
    if _depth != 0:
        print("NEW LEVEL URLS TO SEARCH THROUGH: {}".format(
            scrapeParser.scrapedURLs))
        for newURL in list(scrapeParser.scrapedURLs):
            if newURL not in _superUrlSet:
                # print("url enumerated: " + newURL)
                # get new scraped emails and add to this specific parser's set
                _superUrlSet.add(newURL)
                newEmails = scrapeSubPages(newURL, _depth - 1, _superUrlSet)
                if newEmails:
                    for e in newEmails:
                        scrapeParser.scrapedEmails.add(e)

    return scrapeParser.scrapedEmails
def feed(self, data):
    """
    Main method for purifying HTML (overridden)
    """
    self.reset_purified()
    HTMLParser.feed(self, data)
    return self.html()
def parse_html_data(rootParser, htmlData):
    htmlParser = HTMLParser()
    root = rootParser(htmlParser, None, None, None)
    linedData = htmlData.split('\n')
    for line in linedData:
        htmlParser.feed(line.strip())
    return root
def feed(self, data):
    data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
    data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
    data = data.replace('&#39;', "'")
    data = data.replace('&#34;', '"')
    HTMLParser.feed(self, data)
    HTMLParser.close(self)
def feed(self, data):
    # clear tree and root
    self.__tree.clear()
    self.__root = None
    # new feed
    HP.feed(self, data)
    return self.__root
def parse_links(self):
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parse = HTMLParser()
    parse.feed(data)
    parse.close()
    return parse.anchorlist
def strip_html(text):
    if text is None:
        return ''
    parts = []
    parser = HTMLParser()
    parser.handle_data = parts.append
    parser.feed(text)
    return ''.join(parts)
def feed(self, data, noskip=False):
    self.start_table = self.start_thead = self.start_td = self.start_tr = False
    self.tables = []
    self.table = []
    self.tr = []
    self.data = ''
    self.noskip = noskip
    HTMLParser.feed(self, data)
def feed(self, bytesdata):
    if bytesdata:
        if py3:
            super().feed(bytesdata.decode('latin1'))
        else:
            HTMLParser.feed(self, bytesdata.decode('latin1'))
    else:
        self.close()
def parse(site_urls: list) -> list:
    scripts = []
    for site_url in site_urls:
        parser = ScriptParser(site_url)
        data = load_content(site_url)
        HTMLParser.feed(parser, data)
        scripts.extend(parser.scripts)
    return scripts
def feed(self, html):
    HTMLParser.feed(self, html)
    self.insideDataTd = False
    self.tdCount = -1
    self.tableCount = -1
    self.sizeFound = False
    self.seedsFound = False
    self.leechFound = False
def feed(self, txt):
    self.get_tags.feed(txt)
    tags = self.get_tags.pop()
    ints, tag2int = tags2ints(tags)
    self.match_map = max_tag_match(len(tag2int), ints)
    self.tag_idx = 0
    HTMLParser.feed(self, txt)
def feed(self, data):
    lines = data.split('\n')
    c = 1
    self.lineCount.append(0)
    for line in lines:
        self.lineCount.append(self.lineCount[c - 1] + len(line) + 1)
        c += 1
    HTMLParser.feed(self, data)
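The override above builds a table of cumulative line-start offsets before handing the data to the base parser. A hedged sketch of how such a table can be combined with HTMLParser.getpos() to turn (line, column) positions into absolute character offsets; OffsetTracker and the sample input are made up for illustration, not taken from that snippet.

from html.parser import HTMLParser

class OffsetTracker(HTMLParser):
    # illustrative example, assumes single-feed use
    def __init__(self):
        super().__init__()
        self.line_starts = [0]
        self.tag_offsets = []

    def feed(self, data):
        # record where each line begins in the whole string
        pos = 0
        for line in data.split('\n'):
            pos += len(line) + 1  # +1 for the newline
            self.line_starts.append(pos)
        super().feed(data)

    def handle_starttag(self, tag, attrs):
        lineno, col = self.getpos()  # getpos() lines are 1-based
        self.tag_offsets.append((tag, self.line_starts[lineno - 1] + col))

tracker = OffsetTracker()
tracker.feed('<p>\n  <b>hi</b>\n</p>')
print(tracker.tag_offsets)  # -> [('p', 0), ('b', 6)]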
def parse_links(self):
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(formatter.AbstractFormatter(
        formatter.DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def strip_tags(self, htmlStr):
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
def parse_html(rootParser, htmlPath):
    htmlParser = HTMLParser()
    root = rootParser(htmlParser, None, None, None)
    with open(htmlPath, 'rb') as htmlFile:
        for line in htmlFile:
            # decode the bytes read from the file before feeding the parser
            htmlParser.feed(line.decode('utf-8', 'ignore').strip())
    return root
def parse_links(self):
    'Parse out the links found in downloaded HTML file'
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(AbstractFormatter(DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def feed(self, bytesdata):
    if bytesdata:
        data = bytesdata.decode('latin1')
        if py3:
            super().feed(data)
        else:
            HTMLParser.feed(self, data)
    else:
        self.close()
def feed(self, html, list_of_lines):
    self.list_of_lines = list_of_lines
    self.curr_p_start = 0
    self.curr_p_end = 0
    self.currentLine = 0
    self.text = html
    HTMLParser.feed(self, html)
def feed(self, data):
    """ """
    self.struct.clear()
    HTMLParser.feed(self, data)
    return self.struct.outmost
def strip_tags(html):
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
def feed(self, data: str):
    """
    Feed some data to the parser.

    Can be called multiple times and feeding must be terminated with a call
    to :meth:`.close`.

    :param data: A string containing HTML.
    """
    HTMLParser.feed(self, data)
def strip_tags(html):
    from html.parser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def test_generate_body_with_dummy_data_html(self):
    """Check to make sure that the last tag is an html tag"""
    test_email_data = [{'Author': 'Test Author',
                        'Journal': 'Test Journal',
                        'PubDate': datetime.datetime.now().date(),
                        'Title': 'Test Title',
                        'Link': 'https://www.altmetric.com/details/101571224'}]
    test_email_address = '*****@*****.**'
    test_body = api_parser.generate_body(test_email_data, 30, test_email_address)
    parser = HTMLParser()
    parser.feed(test_body)
    test_output = parser.get_starttag_text()
    parser.close()
    self.assertEqual(test_output, '<a href="mailto:[email protected]">')
def strip_tags(html):
    """Strip HTML tags from a string."""
    html = html.strip()
    parser = HTMLParser()
    result = []
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return result
def parse_links(self):
    'Parse out the links found in downloaded HTML file'
    f = open(self.file, 'r')
    data = f.read()
    # print(data)
    f.close()
    # pa = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    pa = HTMLParser()
    pa.feed(data)
    pa.close()
    return pa.rawdata
def strip_tags(html):
    if html:
        html = html.strip()
        html = html.strip("\n")
        result = []
        parse = HTMLParser()
        parse.handle_data = result.append
        parse.feed(html)
        parse.close()
        return "".join(result)
    return ''
def remove_html(text):
    text = re.sub("<[^<]+?>", "", text)
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    return text
    # the parser-based fallback below is unreachable because of the early return
    s = HTMLParser()
    s.reset()
    s.reset()
    s.strict = False
    s.convert_charrefs = True
    s.fed = []
    s.feed(text)
    return "".join(s.fed)
def remove_html(text):
    text = re.sub('<[^<]+?>', '', text)
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    return text
    # everything below the early return is dead code
    s = HTMLParser()
    s.reset()
    s.reset()
    s.strict = False
    s.convert_charrefs = True
    s.fed = []
    s.feed(text)
    return ''.join(s.fed)
def feed(self, chunk: str) -> None:
    "Feed a given chunk of bytes to the parser"
    if not self.ok:
        return
    if self.message.parsed_headers.get('content-type', [None])[0] in self.link_parseable_types:
        try:
            if not isinstance(chunk, str):
                try:
                    chunk = chunk.decode(self.message.character_encoding, 'ignore')
                except LookupError:
                    pass
            HTMLParser.feed(self, chunk)
        except BadErrorIReallyMeanIt:
            pass
        except Exception as why:  # oh, well...
            if self.err:
                self.err("feed problem: %s" % why)
            self.errors += 1
    else:
        self.ok = False
def Verb_Conjugate(verb):
    verb = verb.strip().replace(" ", "+").lower()
    # verb = verb.encode("unicode-escape")
    print(repr(verb))
    address = "http://www.verbix.com/webverbix/German/{}.html".format(verb)
    # print(address)
    address = urllib.parse.urlsplit(address)
    address = list(address)
    address[2] = urllib.parse.quote(address[2])
    address = urllib.parse.urlunsplit(address)
    # print(address)
    # address = repr(address)  # .encode("unicode-escape")

    with urlopen(address) as website:
        # print(html.read())
        html = deumlautify(website.read()).decode("utf8")
        # print(html)
        # print(type(html))

    parser = HTMLParser()
    try:
        parser.feed(html)
    except:
        pass

    try:
        index = parser.data.index("Nominal Forms")
        index2 = parser.data.index("Verbs conjugated like")
    except:
        raise ValueError("Could not connect to Verbix or an invalid verb was passed in")

    data = reumlautify(parser.data[index:index2])
    # print(data)

    indtenses = ["Present", "Perfect", "Past", "Pluperfect", "Future I", "Future II"]
    contenses = ["Present", "Perfect"]
    verb_entry = Reorder(data, indtenses, contenses)
    return verb_entry
def feed(self, data):
    """
    :param data: Raw SAMI unicode string
    :returns: tuple (unicode, dict, set)
    """
    no_cc = 'no closed captioning available'

    if '<html' in data.lower():
        raise CaptionReadSyntaxError(
            'SAMI File seems to be an HTML file.')
    elif no_cc in data.lower():
        raise CaptionReadSyntaxError('SAMI File contains "%s"' % no_cc)

    # try to find style tag in SAMI
    try:
        # prevent BS4 error with huge SAMI files with unclosed tags
        index = data.lower().find("</head>")
        self.styles = self._css_parse(
            BeautifulSoup(data[:index]).find('style').get_text())
    except AttributeError:
        self.styles = {}

    # fix erroneous italics tags
    data = data.replace('<i/>', '<i>')
    # fix awkward tags found in some SAMIs
    data = data.replace(';>', '>')

    try:
        HTMLParser.feed(self, data)
    except HTMLParseError as e:
        raise CaptionReadSyntaxError(e)

    # close any tags that remain in the queue
    while self.queue != deque([]):
        closing_tag = self.queue.pop()
        self.sami += "</%s>" % closing_tag

    return self.sami, self.styles, self.langs
""" self.warningcolor = '\033[0;37;41m' self.tipcolor = '\033[0;31;42m' self.endcolor = '\033[0m' self._newcolor = '' @property def new(self): """ Customized Python Print Color. """ return self._newcolor @new.setter def new(self, color_str): """ New Color. """ self._newcolor = color_str def disable(self): """ Disable Color Print. """ self.warningcolor = '' self.endcolor = '' # TODO:(edony) Can not filter the needed infomation source_html = requests.get(r'https://www.python.org/events/python-events/') content = source_html.text p = HTMLParser() p.feed(content) print(p.handle_starttag('h3',['class']))
def feed(self, data):
    HTMLParser.feed(self, str(data))
    if self.artist is None or self.title is None or self.license is None:
        raise Exception("Error parsing data from freesound!")
def feed(self, data):
    self.data = data
    HTMLParser.feed(self, data)
downloaded_file += chunk
read_so_far += sys.getsizeof(chunk)

# starts here
print('looking up page')
req = urllib.request.Request(page_address, None)
f = urllib.request.urlopen(req)

print('parsing page')
parser = HTMLParser()
parser.feed(f.read(2000).decode('utf-8'))
print('page parsed')

print('requesting file')
ftp = ftplib.FTP(ftp_server)
ftp.login()
file_length = ftp.size(ftp_file_path)
file_length_mb = file_length / 1024000
print('file: ' + file_name)
print('file size: ' + str(file_length_mb) + 'MB')
print('file size bytes: ' + str(file_length))
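The script above feeds only the first f.read(2000) bytes of the page in a single call. HTMLParser.feed() also accepts data incrementally, so a response can be streamed in chunks and closed at the end. A rough sketch under that assumption; LinkCollector and the example URL are invented for illustration.

import urllib.request
from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    # illustrative subclass that gathers href attributes of <a> tags
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.links.extend(v for k, v in attrs if k == 'href' and v)

parser = LinkCollector()
with urllib.request.urlopen('https://example.com/') as resp:
    while True:
        chunk = resp.read(8192)
        if not chunk:
            break
        # decode each chunk before feeding; 'replace' papers over split multibyte chars
        parser.feed(chunk.decode('utf-8', errors='replace'))
parser.close()
print(parser.links)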
def _check_valid_html(text):
    p = HTMLParser()
    p.feed(text)
    p.close()
def feed(self, input_data):
    HTMLParser.feed(self, input_data)
    self.sanitize_dict()
def feed(self, data):
    if hasattr(self, "baseurl") and hasattr(self, "filepath"):
        return HTMLParser.feed(self, data)
    else:
        raise AcayipError("You have to fill in baseurl and filepath attrs first.")
def run_check(self, test):
    # try:
    p = HTMLParser()
    p.feed(str(test.resultBody))
    p.close()
    return True
def feed(self, data):
    HTMLParser.feed(self, data)
    self.html = ''.join(self.html)
def feed(self, data):
    HTMLParser.feed(self, data)
    return self.root
def feed(self, *other):
    HTMLParser.feed(self, *other)
    if self.sig_count is None:
        raise ValueError("Could not parse the petition count from file '%s'"
                         % (self.filepath))
def feed(self, data):
    # replace the closing script tag (the search string is built by
    # concatenation so the literal tag does not appear in this source file)
    data = data.replace('</' + 'script>', '</ignore>')
    HTMLParser.feed(self, data)
def feed(self, chars):  # [8]
    if self.phase in [self.TERMINATED, self.FOUND]:
        self._terminate()
        return
    HTMLParser.feed(self, chars)
def get_hrefs(html_text):
    HTMLParser.handle_starttag = handle_starttag
    parser = HTMLParser()
    parser.hrefs = []
    parser.feed(html_text)
    return parser.hrefs
def feed(self, data):
    HTMLParser.feed(self, data)
    return self.treestore
def feed(self, data):
    HTMLParser.feed(self, data)
    self.fixActionTimes()