def _set_data_(self, data, *args, **kwargs):
    if isinstance(data, QtGui.QTextDocument):
        self._docViewer_.setDocument(data)
    elif isinstance(data, str):
        from html.parser import HTMLParser
        parser = HTMLParser(convert_charrefs=True)
        parser.feed(data)
        parser.close()
        if parser.get_starttag_text() is None:
            self._docViewer_.document().setPlainText(data)
        else:
            self._docViewer_.document().setHtml(data)
            if data.find("<?xml version=") >= 0:
                self._highlighter_ = xmlutils.XmlSyntaxHighlighter(
                    self._docViewer_.document())
            else:
                self._highlighter_ = None
    else:
        raise TypeError(
            "Expecting a QTextDocument or a str; got %s instead"
            % type(data).__name__)

    if kwargs.get("show", True):
        self.activateWindow()
def feed(self, data):
    """
    :param data: Raw SAMI unicode string
    :returns: tuple (str, dict, set)
    """
    no_cc = 'no closed captioning available'

    if '<html' in data.lower():
        raise CaptionReadSyntaxError('SAMI File seems to be an HTML file.')
    elif no_cc in data.lower():
        raise CaptionReadSyntaxError(f'SAMI File contains "{no_cc}"')

    # try to find style tag in SAMI
    try:
        # prevent BS4 error with huge SAMI files with unclosed tags
        index = data.lower().find("</head>")
        style = BeautifulSoup(data[:index], "lxml").find('style')
        if style and style.contents:
            self.styles = self._css_parse(' '.join(style.contents))
    except AttributeError:
        self.styles = {}

    # fix erroneous italics tags
    data = data.replace('<i/>', '<i>')
    # fix awkward tags found in some SAMIs
    data = data.replace(';>', '>')

    HTMLParser.feed(self, data)

    # close any tags that remain in the queue
    while self.queue != deque([]):
        closing_tag = self.queue.pop()
        self.sami += f"</{closing_tag}>"

    return self.sami, self.styles, self.langs
def fuzz(buf):
    try:
        string = buf.decode("ascii")
        parser = HTMLParser()
        parser.feed(string)
    except UnicodeDecodeError:
        pass
def run(self, event_pack: EventPackage):
    random.seed(time.time())

    # prepare the search terms
    searchTerms = event_pack.body
    searchTerms.pop(0)
    search = "sfw+" + "+".join(searchTerms)
    url = "https://www.google.com/search?tbm=isch&q=" + search + "&oq=" + search + "&gs_l=img&safesearch=on"

    # get the page
    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    text = response.read()

    # html parser
    parser = HTMLParser()
    theImages = []

    def handleTag(tag, attrs):
        if tag == "img":
            for n in attrs:
                if n[0] == "data-src":
                    # print(n[1])
                    theImages.append(str(n[1]))

    parser.handle_starttag = handleTag
    parser.feed(str(text))

    # randint's upper bound is inclusive, so use len - 1 to avoid an IndexError
    nrimg = random.randint(0, len(theImages) - 1)
    return theImages[nrimg]
def get_html_text(html: str):
    parser = HTMLParser()
    parser.text = ""
    parser.important_tag = True
    parser.feed(html)
    return parser.text.strip()
def feed(self, raw_data):
    assert isinstance(raw_data, str), "feed data must be unicode!"

    data = raw_data.strip()

    # cut out <pre> and <tt> block tag areas
    data = block_re.sub(self._pre_cut_out, data)
    data = inline_re.sub(self._pre_cut_out, data)

    # Delete whitespace from html code
    data = strip_html(data)

    if self.debugging:
        print("_" * 79)
        print("raw data:")
        print(repr(raw_data))
        print(" -" * 40)
        print("cleaned data:")
        print(data)
        print("-" * 79)
        # print(clean_data.replace(">", ">\n"))
        # print("-"*79)

    HTMLParser.feed(self, data)

    return self.root
def _strip_tags(self, html):
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
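Several of the stripping helpers in this listing assign handle_data directly onto a bare HTMLParser instance. For comparison, here is a minimal subclass-based sketch of the same technique; the TagStripper name and the sample input are illustrative and not taken from any of the projects above.

from html.parser import HTMLParser

class TagStripper(HTMLParser):
    # illustrative subclass, not part of any snippet in this listing
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.parts = []

    def handle_data(self, data):
        # collect every text node found between tags
        self.parts.append(data)

    def get_text(self):
        return ''.join(self.parts)

stripper = TagStripper()
stripper.feed('<p>Hello <b>world</b></p>')
stripper.close()
print(stripper.get_text())  # -> Hello world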
def feed(self, data, mode=0):
    self.mode = mode
    # {product name: link to the product} or a list of manufacturers
    self.result = {} if not self.mode else []
    # with open(r'd:\temp\just\page.html', 'w') as fo:
    #     fo.write(data)
    HTMLParser.feed(self, data)
def scrapeSubPages(_url, _depth, _superUrlSet=set()):
    # perform scraping given the url
    try:
        page = request.urlopen(_url)
    except Exception as e:
        # may catch an unauthorized error 401
        print("ERROR! {}".format(e))
        return

    scrapeParser = HTMLParser(_url)
    scrapeParser.feed(str(page.read()))
    # print(scrapeParser.scrapedEmails)

    # iterate through all newly found urls in this webpage
    if _depth != 0:
        print("NEW LEVEL URLS TO SEARCH THROUGH: {}".format(
            scrapeParser.scrapedURLs))
        for newURL in list(scrapeParser.scrapedURLs):
            if newURL not in _superUrlSet:
                # print("url enumerated: " + newURL)
                # get new scraped emails and add to this specific parser's set
                _superUrlSet.add(newURL)
                newEmails = scrapeSubPages(newURL, _depth - 1, _superUrlSet)
                if newEmails:
                    for e in newEmails:
                        scrapeParser.scrapedEmails.add(e)

    return scrapeParser.scrapedEmails
def feed(self, data):
    """
    Main method for purifying HTML (overridden)
    """
    self.reset_purified()
    HTMLParser.feed(self, data)
    return self.html()
def parse_html_data(rootParser, htmlData):
    htmlParser = HTMLParser()
    root = rootParser(htmlParser, None, None, None)
    linedData = htmlData.split('\n')
    for line in linedData:
        htmlParser.feed(line.strip())
    return root
def feed(self, data):
    data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
    data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
    data = data.replace('&#39;', "'")
    data = data.replace('&#34;', '"')
    HTMLParser.feed(self, data)
    HTMLParser.close(self)
def feed(self, data):
    # clear tree and root
    self.__tree.clear()
    self.__root = None
    # new feed
    HP.feed(self, data)
    return self.__root
def parse_links(self):
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parse = HTMLParser()
    parse.feed(data)
    parse.close()
    return parse.anchorlist
def strip_html(text):
    if text is None:
        return ''
    parts = []
    parser = HTMLParser()
    parser.handle_data = parts.append
    parser.feed(text)
    return ''.join(parts)
def feed(self, data, noskip=False):
    self.start_table = self.start_thead = self.start_td = self.start_tr = False
    self.tables = []
    self.table = []
    self.tr = []
    self.data = ''
    self.noskip = noskip
    HTMLParser.feed(self, data)
def feed(self, bytesdata):
    if bytesdata:
        if py3:
            super().feed(bytesdata.decode('latin1'))
        else:
            HTMLParser.feed(self, bytesdata.decode('latin1'))
    else:
        self.close()
def parse(site_urls: list) -> list:
    scripts = []
    for site_url in site_urls:
        parser = ScriptParser(site_url)
        data = load_content(site_url)
        HTMLParser.feed(parser, data)
        scripts.extend(parser.scripts)
    return scripts
def feed(self, html):
    HTMLParser.feed(self, html)
    self.insideDataTd = False
    self.tdCount = -1
    self.tableCount = -1
    self.sizeFound = False
    self.seedsFound = False
    self.leechFound = False
def feed(self, txt):
    self.get_tags.feed(txt)
    tags = self.get_tags.pop()
    ints, tag2int = tags2ints(tags)
    self.match_map = max_tag_match(len(tag2int), ints)
    self.tag_idx = 0
    HTMLParser.feed(self, txt)
def feed(self, data):
    lines = data.split('\n')
    c = 1
    self.lineCount.append(0)
    for line in lines:
        self.lineCount.append(self.lineCount[c - 1] + len(line) + 1)
        c += 1
    HTMLParser.feed(self, data)
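The override above builds a table of cumulative line-start offsets before handing the data to the base parser. A hedged sketch of how such a table can be combined with HTMLParser.getpos() to turn (line, column) positions into absolute character offsets; OffsetTracker and the sample input are made up for illustration, not taken from that snippet.

from html.parser import HTMLParser

class OffsetTracker(HTMLParser):
    # illustrative example, assumes single-feed use
    def __init__(self):
        super().__init__()
        self.line_starts = [0]
        self.tag_offsets = []

    def feed(self, data):
        # record where each line begins in the whole string
        pos = 0
        for line in data.split('\n'):
            pos += len(line) + 1  # +1 for the newline
            self.line_starts.append(pos)
        super().feed(data)

    def handle_starttag(self, tag, attrs):
        lineno, col = self.getpos()  # getpos() lines are 1-based
        self.tag_offsets.append((tag, self.line_starts[lineno - 1] + col))

tracker = OffsetTracker()
tracker.feed('<p>\n  <b>hi</b>\n</p>')
print(tracker.tag_offsets)  # -> [('p', 0), ('b', 6)]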
def parse_links(self):
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(formatter.AbstractFormatter(
        formatter.DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def strip_tags(self, htmlStr):
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
def parse_html(rootParser, htmlPath):
    htmlParser = HTMLParser()
    root = rootParser(htmlParser, None, None, None)
    with open(htmlPath, 'rb') as htmlFile:
        for line in htmlFile:
            # decode the bytes read from the file before feeding the parser
            htmlParser.feed(line.decode('utf-8', 'ignore').strip())
    return root
def parse_links(self):
    'Parse out the links found in downloaded HTML file'
    f = open(self.file, 'r')
    data = f.read()
    f.close()
    parser = HTMLParser(AbstractFormatter(DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def feed(self, bytesdata):
    if bytesdata:
        data = bytesdata.decode('latin1')
        if py3:
            super().feed(data)
        else:
            HTMLParser.feed(self, data)
    else:
        self.close()
def feed(self, html, list_of_lines):
    self.list_of_lines = list_of_lines
    self.curr_p_start = 0
    self.curr_p_end = 0
    self.currentLine = 0
    self.text = html
    HTMLParser.feed(self, html)
def feed(self, data):
    """ """
    self.struct.clear()
    HTMLParser.feed(self, data)
    return self.struct.outmost
def strip_tags(html):
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
def feed(self, data: str):
    """
    Feed some data to the parser.

    Can be called multiple times and feeding must be terminated with a call
    to :meth:`.close`.

    :param data: A string containing HTML.
    """
    HTMLParser.feed(self, data)
def strip_tags(html):
    from html.parser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def test_generate_body_with_dummy_data_html(self):
    """Check to make sure that the last tag is an html tag"""
    test_email_data = [{'Author': 'Test Author',
                        'Journal': 'Test Journal',
                        'PubDate': datetime.datetime.now().date(),
                        'Title': 'Test Title',
                        'Link': 'https://www.altmetric.com/details/101571224'}]
    test_email_address = '*****@*****.**'
    test_body = api_parser.generate_body(test_email_data, 30, test_email_address)
    parser = HTMLParser()
    parser.feed(test_body)
    test_output = parser.get_starttag_text()
    parser.close()
    self.assertEqual(test_output, '<a href="mailto:[email protected]">')
def strip_tags(html):
    """Strip HTML tags from a string."""
    html = html.strip()
    parser = HTMLParser()
    result = []
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return result
def parse_links(self):
    'Parse out the links found in downloaded HTML file'
    f = open(self.file, 'r')
    data = f.read()
    # print(data)
    f.close()
    # pa = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    pa = HTMLParser()
    pa.feed(data)
    pa.close()
    return pa.rawdata
def strip_tags(html):
    if html:
        html = html.strip()
        html = html.strip("\n")
        result = []
        parse = HTMLParser()
        parse.handle_data = result.append
        parse.feed(html)
        parse.close()
        return "".join(result)
    return ''
def remove_html(text):
    text = re.sub("<[^<]+?>", "", text)
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    return text
    # the parser-based fallback below is unreachable because of the early return
    s = HTMLParser()
    s.reset()
    s.reset()
    s.strict = False
    s.convert_charrefs = True
    s.fed = []
    s.feed(text)
    return "".join(s.fed)
def remove_html(text):
    text = re.sub('<[^<]+?>', '', text)
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    return text
    # everything below the early return is dead code
    s = HTMLParser()
    s.reset()
    s.reset()
    s.strict = False
    s.convert_charrefs = True
    s.fed = []
    s.feed(text)
    return ''.join(s.fed)
def feed(self, chunk: str) -> None:
    "Feed a given chunk of bytes to the parser"
    if not self.ok:
        return
    if self.message.parsed_headers.get('content-type', [None])[0] in self.link_parseable_types:
        try:
            if not isinstance(chunk, str):
                try:
                    chunk = chunk.decode(self.message.character_encoding, 'ignore')
                except LookupError:
                    pass
            HTMLParser.feed(self, chunk)
        except BadErrorIReallyMeanIt:
            pass
        except Exception as why:  # oh, well...
            if self.err:
                self.err("feed problem: %s" % why)
            self.errors += 1
    else:
        self.ok = False
def Verb_Conjugate(verb):
    verb = verb.strip().replace(" ", "+").lower()
    # verb = verb.encode("unicode-escape")
    print(repr(verb))
    address = "http://www.verbix.com/webverbix/German/{}.html".format(verb)
    # print(address)
    address = urllib.parse.urlsplit(address)
    address = list(address)
    address[2] = urllib.parse.quote(address[2])
    address = urllib.parse.urlunsplit(address)
    # print(address)
    # address = repr(address)  # .encode("unicode-escape")

    with urlopen(address) as website:
        # print(html.read())
        html = deumlautify(website.read()).decode("utf8")
        # print(html)
        # print(type(html))

    parser = HTMLParser()
    try:
        parser.feed(html)
    except:
        pass

    try:
        index = parser.data.index("Nominal Forms")
        index2 = parser.data.index("Verbs conjugated like")
    except:
        raise ValueError("Could not connect to Verbix or an invalid verb was passed in")

    data = reumlautify(parser.data[index:index2])
    # print(data)

    indtenses = ["Present", "Perfect", "Past", "Pluperfect", "Future I", "Future II"]
    contenses = ["Present", "Perfect"]
    verb_entry = Reorder(data, indtenses, contenses)
    return verb_entry
def feed(self, data):
    """
    :param data: Raw SAMI unicode string
    :returns: tuple (unicode, dict, set)
    """
    no_cc = 'no closed captioning available'

    if '<html' in data.lower():
        raise CaptionReadSyntaxError(
            'SAMI File seems to be an HTML file.')
    elif no_cc in data.lower():
        raise CaptionReadSyntaxError('SAMI File contains "%s"' % no_cc)

    # try to find style tag in SAMI
    try:
        # prevent BS4 error with huge SAMI files with unclosed tags
        index = data.lower().find("</head>")
        self.styles = self._css_parse(
            BeautifulSoup(data[:index]).find('style').get_text())
    except AttributeError:
        self.styles = {}

    # fix erroneous italics tags
    data = data.replace('<i/>', '<i>')
    # fix awkward tags found in some SAMIs
    data = data.replace(';>', '>')

    try:
        HTMLParser.feed(self, data)
    except HTMLParseError as e:
        raise CaptionReadSyntaxError(e)

    # close any tags that remain in the queue
    while self.queue != deque([]):
        closing_tag = self.queue.pop()
        self.sami += "</%s>" % closing_tag

    return self.sami, self.styles, self.langs
""" self.warningcolor = '\033[0;37;41m' self.tipcolor = '\033[0;31;42m' self.endcolor = '\033[0m' self._newcolor = '' @property def new(self): """ Customized Python Print Color. """ return self._newcolor @new.setter def new(self, color_str): """ New Color. """ self._newcolor = color_str def disable(self): """ Disable Color Print. """ self.warningcolor = '' self.endcolor = '' # TODO:(edony) Can not filter the needed infomation source_html = requests.get(r'https://www.python.org/events/python-events/') content = source_html.text p = HTMLParser() p.feed(content) print(p.handle_starttag('h3',['class']))
def feed(self, data):
    HTMLParser.feed(self, str(data))
    if self.artist is None or self.title is None or self.license is None:
        raise Exception("Error parsing data from freesound!")
def feed(self, data):
    self.data = data
    HTMLParser.feed(self, data)
downloaded_file += chunk
read_so_far += sys.getsizeof(chunk)

# starts here
print('looking up page')
req = urllib.request.Request(page_address, None)
f = urllib.request.urlopen(req)

print('parsing page')
parser = HTMLParser()
parser.feed(f.read(2000).decode('utf-8'))
print('page parsed')

print('requesting file')
ftp = ftplib.FTP(ftp_server)
ftp.login()
file_length = ftp.size(ftp_file_path)
file_length_mb = file_length / 1024000
print('file: ' + file_name)
print('file size: ' + str(file_length_mb) + 'MB')
print('file size bytes: ' + str(file_length))
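The script above feeds only the first f.read(2000) bytes of the page in a single call. HTMLParser.feed() also accepts data incrementally, so a response can be streamed in chunks and closed at the end. A rough sketch under that assumption; LinkCollector and the example URL are invented for illustration.

import urllib.request
from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    # illustrative subclass that gathers href attributes of <a> tags
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.links.extend(v for k, v in attrs if k == 'href' and v)

parser = LinkCollector()
with urllib.request.urlopen('https://example.com/') as resp:
    while True:
        chunk = resp.read(8192)
        if not chunk:
            break
        # decode each chunk before feeding; 'replace' papers over split multibyte chars
        parser.feed(chunk.decode('utf-8', errors='replace'))
parser.close()
print(parser.links)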
def _check_valid_html(text):
    p = HTMLParser()
    p.feed(text)
    p.close()
def feed(self, input_data):
    HTMLParser.feed(self, input_data)
    self.sanitize_dict()
def feed(self, data):
    if hasattr(self, "baseurl") and hasattr(self, "filepath"):
        return HTMLParser.feed(self, data)
    else:
        raise AcayipError("You have to fill in baseurl and filepath attrs first.")
def run_check(self, test):
    # try:
    p = HTMLParser()
    p.feed(str(test.resultBody))
    p.close()
    return True
def feed(self, data):
    HTMLParser.feed(self, data)
    self.html = ''.join(self.html)
def feed(self, data):
    HTMLParser.feed(self, data)
    return self.root
def feed(self, *other):
    HTMLParser.feed(self, *other)
    if self.sig_count is None:
        raise ValueError("Could not parse the petition count from file '%s'"
                         % (self.filepath))
def feed(self, data):
    # replace the closing script tag (the search string is built by
    # concatenation so the literal tag does not appear in this source file)
    data = data.replace('</' + 'script>', '</ignore>')
    HTMLParser.feed(self, data)
def feed(self, chars):  # [8]
    if self.phase in [self.TERMINATED, self.FOUND]:
        self._terminate()
        return
    HTMLParser.feed(self, chars)
def get_hrefs(html_text):
    HTMLParser.handle_starttag = handle_starttag
    parser = HTMLParser()
    parser.hrefs = []
    parser.feed(html_text)
    return parser.hrefs
def feed(self, data):
    HTMLParser.feed(self, data)
    return self.treestore
def feed(self, data):
    HTMLParser.feed(self, data)
    self.fixActionTimes()