def __init__(self, strict=False, reps=None, outs=None, sc=True):
    """Initialize the parser.

    Args:
        strict: Passed through to ``HTMLParser.__init__`` (legacy flag).
        reps: Optional report/replacement helper; when given, it receives
            a back-reference to this parser via its ``parser`` attribute.
        outs: Output stream the parser writes to.
        sc: Whether HTML comments should be stripped.
    """
    self.rep = reps
    self.outStream = outs
    self.stripComment = sc
    # Bug fix: the original dereferenced self.rep unconditionally, so the
    # documented default (reps=None) crashed with AttributeError.
    if self.rep is not None:
        self.rep.parser = self
    HTMLParser.__init__(self, strict)
def __init__(self):
    """Set up per-document parsing state."""
    HTMLParser.__init__(self)
    # Collected anchor hrefs and current parsing flags.
    self.links = []
    self.in_anchor = False
    self.attrs = None
    self.title = ''
def __init__(self, baseURL):
    """Link-collecting parser rooted at *baseURL*."""
    HTMLParser.__init__(self)
    # Open-tag stack plus the anchors/links gathered so far.
    self.stack = []
    self.anchors = set()
    self.links = []
    self.baseURL = baseURL
    self.printed = False
def __init__(self): """An overload of the HTML Parser constructor. We use this initialization code to make sure that every variable is flushed. Arguments: self -- Allows the function to reference parent class properties. It is unnecessary to specify self during function calls as it is implied. """ # Initialize the HTML Parser. HTMLParser.__init__(self) # Initialize the variables. self._record_name = False self._record_meal = False self._record_station = False self._record_attributes = False self._day = EMPTY_STRING self._meal = EMPTY_STRING self._station = EMPTY_STRING self._name_text = [] self._station_text = [] self._attributes = [] # Hold all the dining hall menus. self.menu = []
def __init__(self, builder=None, encoding=None):
    """Create a tree-building parser.

    builder  -- target tree builder; a fresh ElementTree.TreeBuilder()
                is created when omitted.
    encoding -- source encoding, defaulting to "iso-8859-1".
    """
    self.__stack = []
    self.__builder = builder if builder is not None else ElementTree.TreeBuilder()
    self.encoding = encoding or "iso-8859-1"
    HTMLParser.__init__(self)
def __init__(self):
    """Deprecated link collector; emits a DeprecationWarning on creation."""
    warnings.warn(
        "portage.getbinpkg.ParseLinks is deprecated",
        DeprecationWarning,
        stacklevel=2,
    )
    # Anchor hrefs gathered while parsing.
    self.PL_anchors = []
    html_parser_HTMLParser.__init__(self)
def __init__(self):
    """Parser that collects data from one designated element."""
    HTMLParser.__init__(self)
    # Toggled on once the target tag/attribute pair is seen.
    self.collect_data = False
    self.bound = 20
    # Target element: <div id="content">.
    self.des_tag = "div"
    self.des_attr = ("id", "content")
    self.stations_info = None
def __init__(self):
    """Initialize the state machine and output fields."""
    HTMLParser.__init__(self)
    self.state = State.NOWHERE  # current position in the document
    self.data = ""              # text accumulated for the current field
    self.pnpid = None
    self.company = None
    self.table = []             # rows collected so far
def __init__(self):
    """Form-scraping parser state."""
    HTMLParser.__init__(self)
    self.url = None           # target URL (presumably the form action)
    self.params = {}          # collected form parameters
    self.in_form = False      # currently inside a <form>
    self.form_parsed = False  # a complete form has been handled
    self.method = "GET"       # submission method, until overridden
def __init__(self):
    """Reset all bookkeeping before a new document is fed."""
    HTMLParser.__init__(self)
    self.state = []    # parsing-state stack
    self.href = ""     # most recently seen link target
    self.obj = {}
    self.index = {}
    self.done = False  # set once parsing is finished
def __init__(self):
    """Article extractor: records author/body text from whitelisted tags."""
    HTMLParser.__init__(self)
    self.data = {}
    self.recordingAuthor = False
    self.recordingBody = False
    self.data["body"] = ""
    # Only text inside these tags contributes to the body.
    self.save_tags = ['p', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5']
def gen_slogan(msg):
    """Fetch a slogan for *msg* from sloganizer.net.

    Retries (recursively) until a non-empty slogan comes back.
    Raises AttributeError if the response contains no ``>...<`` span.
    """
    import html  # stdlib replacement for the removed HTMLParser.unescape
    resp = get('http://www.sloganizer.net/en/outbound.php', params={'slogan': msg})
    slogan = re.search('>(.*)<', resp.text).group(1)
    # The service double-escapes entities, hence the double unescape.
    # HTMLParser.unescape was deprecated in 3.4 and removed in Python 3.9.
    slogan = html.unescape(html.unescape(slogan))
    slogan = slogan.replace('\\', '').strip()
    return slogan if slogan else gen_slogan(msg)
def __init__(self, strict=False):
    """Initialize the advisory-page parser and all extraction state.

    There are 3 types of variables:
    * data_variables: these are the required information;
    * data_check_variables: boolean values corresponding to each of the
      data_variables, to keep a check on whether the data has already
      been extracted or not;
    * tag_check_variables: used for matching the proper format.
    """
    # Constructor call of parent class.
    HTMLParser.__init__(self, strict)
    # --- description ---
    self.h1 = False
    self.desc = False
    self.description = ' '
    # --- solution ---
    self.sol = False
    self.solution = ' '
    self.p = False
    self.li = False
    # --- references ---
    self.ref = False
    self.references = ' '
    # --- CVSS score / CVE id ---
    self.cvss = False
    self.cvss_score = 0.0
    self.cve = False
    self.cve_id = ' '
    self.links = []
    # --- affected products ---
    self.prod = False
    self.products = []
    self.last_h6 = ' '
    self.h6 = False
    self.h7 = False
    # --- attack classification and impact ---
    self.clas = False
    self.attack_from = ' '
    self.attk = False
    self.impact = ' '
    self.impt = False
def __init__(self, args):
    """Initialize the crawler from parsed command-line *args*.

    Sets up crawl configuration, the MongoDB collection used for
    storage, URL bookkeeping lists, and per-post parsing flags.
    ``args`` is assumed to carry URL/depth/time/db/coll/sub/verbose
    attributes (argparse namespace) — confirm against the caller.
    """
    HTMLParser.__init__(self)
    self.root_url = args.URL    # Original URL passed.
    self.netloc = urllib.parse.urlparse(self.root_url).netloc  # Netloc of the URL.
    self.depth = args.depth     # Distance (pages) to travel.
    self.timer = args.time      # Amount of time per page.
    self.db = MongoClient()[args.db][args.coll]  # Database that stores data.
    self.sub = args.sub         # Subdirectory to set as root of webpage.
    self.verbose = args.verbose  # Verbosity setting.
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    self.key_terms = ["buy", "sell", "trade", "trading"]
    self.count = 0                  # Amount of pages processed.
    self.posts = 0                  # Amount of posts scanned.
    self.urlBlacklist = []          # Already completed URLS.
    self.urlDNU = []                # Do not use URLS, duplicates.
    self.urlList = [self.root_url]  # List of URLS to scan.
    self.items = []                 # Items to look for.
    self.discovered = {}            # Items discovered + [URLs]
    self.BigDict = {}               # Dictionary containing ThreadID + [URLS] <- urlDNU list?.
    # # # # # # # # # # # # # # # # # # # # #
    self.li_main = False           # Start of play contribution
    self.blockquote_main = False   # Start of the message
    self.div_quote_main = False    # Start of Quote Container
    self.div_quote_xpand = False   # Start of QuoteExpand
    self.blockquote_quote = False  # Start of Quote Message
    self.text_lock = True          # Locks the ability to print text or use it.
    self.li_name = None            # Name of original author
    self.blockquote_name = None    # Name of person being quoted.
    # # # # # # # # # # # # # # # # # # #
    self.queryDB()  # Loads the self.items list.
def handle_task(self, job):
    """Render and send one mail job.

    *job* is a dict with optional keys: user, group, sender, recipients,
    subject, body, attachments, smtp_host, smtp_port, html, jobctx.
    Subject and body are rendered through Template with the jobctx data;
    for plain-text mail, HTML entities are decoded first.
    Returns whatever ``self.sendmail`` returns.
    """
    user = job.get('user', 'root')
    group = job.get('group', 'root')
    mail = job.get('sender', None)
    account = Account(user=user, group=group, mail=mail)
    recipients = job.get('recipients', None)
    subject = ensure_unicode(job.get('subject', ''))
    body = ensure_unicode(job.get('body', ''))
    attachments = job.get('attachments', None)
    smtp_host = job.get('smtp_host', 'localhost')
    smtp_port = job.get('smtp_port', 25)
    html = job.get('html', False)
    template_data = job.get('jobctx', {})
    body = Template(body)(template_data)
    subject = Template(subject)(template_data)
    if not html:
        # html.unescape replaces HTMLParser().unescape, which was
        # deprecated in Python 3.4 and removed in 3.9.  Imported here
        # because the local variable `html` shadows the module name.
        from html import unescape
        body = unescape(body)
        subject = unescape(subject)
    # Execute the task
    return self.sendmail(account, recipients, subject, body, attachments,
                         smtp_host, smtp_port, html)
def forwards(apps, schema_editor):
    """Data migration: turn CarouselSlidePlugin captions into child
    TextPlugin instances, rewriting embedded ``<img id="plugin_obj_N">``
    markers into ``<cms-plugin>`` tags.
    """
    from html import unescape  # HTMLParser.unescape was removed in Python 3.9
    for cascade_element in CascadeElement.objects.all():
        if cascade_element.plugin_type != 'CarouselSlidePlugin':
            continue
        caption = cascade_element.glossary.get('caption')
        if not caption:
            continue
        text_element = add_plugin(cascade_element.placeholder, TextPlugin,
                                  cascade_element.language, target=cascade_element)
        old_body = unescape(caption)
        new_body, count = _replace_text_body(
            old_body,
            input_pattern=r'<img ([^>]*)\bid="plugin_obj_(?P<pk>\d+)"([^>]*)/?>',
            output_tag='<cms-plugin {}></cms-plugin>',
            id_format='id="{}"',
        )
        text_element.body = new_body
        text_element.save()
        # TODO: need to be re-tested (branch deliberately disabled via `False and`)
        if False and count > 0:
            for link_element in CMSPlugin.objects.filter(
                    parent_id__in=(cascade_element.id, cascade_element.parent_id),
                    plugin_type='TextLinkPlugin'):
                # print("Move Link {} from {} -> {}".format(link_element.id, link_element.parent_id, text_element.id))
                link_element.move(text_element, pos='last-child')
                link_element.save()
def get_images(current_title, title, titles_length):
    """Return the image links found on the wiki page *title*.

    current_title and titles_length are only used for progress output.
    Returns [] for empty, redirecting, or malformatted pages.
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    print("Fetching images from %s... (%s/%s)" % (title, current_title + 1, titles_length))
    # Escape the title so we can create a valid link
    # title = title.replace('\'', '%27').replace(' ', '%20')
    # Retry until the server answers.
    while True:
        try:
            page = urlopen(SOURCE_LOCATION % title).read().decode(ENCODING)
            break
        except IOError:
            print("\tServer's being lazy, retrying...")
    if not page:
        print("\tFailed to get %s's images!" % title)
        return []
    # Ignore redirects
    if search("#DOORVERWIJZING", page, I | M) is not None or search("#REDIRECT.*", page, I | M) is not None:
        print("\tSkipping redirecting page %s" % title)
        return []
    imagelinks = []
    parser = ImageLocater(imagelinks)
    page = unescape(page)
    try:
        parser.feed(page)
    except Exception:
        # Was a bare `except:`; Exception keeps the best-effort behavior
        # without swallowing KeyboardInterrupt/SystemExit.
        print("%s is a malformatted page" % title)
        return []
    return imagelinks
def publishPost(self, post, link, comment):
    """Publish *post* to the configured Telegram channel.

    Builds an HTML message linking to *link*; text longer than the
    Telegram message limit is split into two messages.
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    logging.info(" Publishing in Telegram...")
    bot = self.tc
    title = post
    content = comment
    links = ""
    channel = self.channel
    title = unescape(title)
    text = '<a href="'+link+'">'+title+ "</a>\n" + content + '\n\n' + links
    textToPublish2 = ""
    if len(text) < 4090:
        textToPublish = text
        links = ""
    else:
        # Too long: drop the trailing links and split the body.
        text = '<a href="'+link+'">'+title + "</a>\n" + content
        textToPublish = text[:4080] + ' ...'
        textToPublish2 = '... '+ text[4081:]
    logging.info("text to "+ textToPublish)
    logging.info("text to 2"+ textToPublish2)
    bot.sendMessage('@'+channel, textToPublish, parse_mode='HTML')
    if textToPublish2:
        try:
            bot.sendMessage('@'+channel, textToPublish2[:4090], parse_mode='HTML')
        except Exception:
            # Was a bare `except:`; keep the fallback message behavior.
            bot.sendMessage('@'+channel, "Text is longer", parse_mode='HTML')
    if links:
        bot.sendMessage('@'+channel, links, parse_mode='HTML')
def feed(self, data):
    """Purify *data* and return the resulting HTML (overrides feed)."""
    self.reset_purified()
    HTMLParser.feed(self, data)
    return self.html()
def __init__(self):
    """Converter state; character references are handled manually."""
    HTMLParser.__init__(self)
    # Keep entity/char references un-decoded so handlers see them raw.
    self.convert_charrefs = False
    self.last = "starttag"  # kind of the previously handled token
    self.in_pre = False     # inside a <pre> block
    self.output = ""        # accumulated output text
    self.last_tag = ""
def __init__(self, base_href):
    """Group-listing scraper rooted at *base_href*."""
    HTMLParser.__init__(self)
    self.base_href = base_href
    self.results = {}
    # Current group fields, filled in while inside their markup.
    self.group_name = None
    self.group_desc = None
    # Depth counters for the sections currently being traversed.
    self.in_group_name = 0
    self.in_group_desc = 0
    self.in_activity = 0
    self._clear_info()
def __init__(self, url):
    """Listing parser for the page at *url*."""
    HTMLParser.__init__(self)
    self.url = url
    # Per-row parsing state.
    self.current_item = None
    self.save_data = None
    self.seeds_leech = False
    # Pre-compiled pattern for stripping commas (e.g. size strings).
    self.size_repl = re_compile(",")
def __init__(self, zip_file):
    """Filtering parser that writes processed HTML to a memory buffer."""
    HTMLParser.__init__(self)
    self._html = StringIO()  # buffer for the processed HTML
    self._zip_file = zip_file
    # Nesting depth used to exclude the contents of script and object tags.
    self._excl_nested_level = 0
def __init__(self):
    """Title extractor.

    The title is kept as a list so literal bytes and escaped Unicode
    chunks can be stored separately and joined later.
    """
    if py3:
        super().__init__()
    else:
        HTMLParser.__init__(self)
    self.title = []
def parse_html_data(rootParser, htmlData):
    """Parse *htmlData* with a fresh HTMLParser and return the root.

    rootParser is called as rootParser(parser, None, None, None) to build
    the root object; each line of htmlData is stripped and fed in turn.
    """
    parser = HTMLParser()
    root = rootParser(parser, None, None, None)
    for raw_line in htmlData.split('\n'):
        parser.feed(raw_line.strip())
    return root
def get_game_list(system):
    """List all the games on Guardiana for a given system.

    Returns a list of dicts with 'url' (session id stripped) and 'title'
    (a list of entity-decoded titles).
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    response = urllib.request.urlopen(
        "http://www.guardiana.net/MDG-Database/Complete-List/" + system + "/")
    doc = response.read()
    soup = BeautifulSoup(doc)
    html_game_list = soup.find("div", {"id": "MDGD_FullList_Box"})
    game_list = re.findall(
        """» <a href="(.+?)">(.+?)</a><br/>(?:<em>)?(.*?)(?:</em>)?<br/>""",
        str(html_game_list))
    game_dict_list = []
    for game in game_list:
        game_dict = {'url': "http://www.guardiana.net" + game[0], 'title': []}
        # Clean up the URL (drop any PHPSESSID query) and add it.
        result = re.search("(.*?)\?PHPSESSID=.*?", game[0])
        if result:
            game_dict['url'] = "http://www.guardiana.net" + result.group(1)
        else:
            game_dict['url'] = "http://www.guardiana.net" + game[0]
        # Unescape the HTML entities from titles and add them.
        game_dict['title'].append(unescape(game[1]))
        game_dict_list.append(game_dict)
    return game_dict_list
def linksh(self, cli, ev):
    """Announce the <title> of any URL posted in a tracked channel.

    YouTube links are delegated to self.ytlinks.  Returns 0 on success,
    1 when the event is ignored (untracked channel, no URL, no title).
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    try:
        self.chancache[ev.target.lower()]
    except Exception:
        # Was a bare `except:`; Exception preserves the "unknown channel
        # -> ignore" behavior without trapping KeyboardInterrupt.
        return 1
    if self.yt is True:
        yr = re.compile(".*(youtube\.com\/watch\?.*v=|youtu\.be\/)([A-Za-z"
                        "0-9._%-]*)[&\w;=\+_\-]*.*")
        res = yr.search(ev.arguments[0])
        if res is not None:
            self.ytlinks(cli, ev, res)
            return 0
    url = re.compile("((https?):((\/\/)|(\\\\))+[\w\d:#@%\/;$()~_?\+-=\\\."
                     "&]*)")
    res = url.search(ev.arguments[0])
    if res is None:
        return 1
    uri = res.group(1)
    r = urllib.request.urlopen(uri).read().decode('utf-8', 'replace')
    r = unescape(r)
    yr = re.compile(".*<title[^>]*>([^<]+)</title>.*")
    title = yr.search(r)
    if title is None:
        return 1
    cli.msg(ev.target, title.group(1))
def __init__(self):
    """Photo-list crawler state plus the tag/class filters of interest."""
    self.urlList = []
    self.index = 0
    self.nextUrl = ''
    # Only elements matching these tag/class combinations are considered.
    self.tagList = ['li', 'a']
    self.classList = ['photo-list-padding', 'pic']
    HTMLParser.__init__(self)
def _parse_article(self, div):
    """Extract one scholar search-result entry from *div*.

    Populates self.article (title, url, year, links, summary) and, when
    a title was found, passes it to self.handle_article.
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    self.article = Article()
    for tag in div:
        if not hasattr(tag, 'name'):
            continue
        if tag.name == 'div' and self._tag_has_class(tag, 'gs_ri'):
            rt = tag.find('h3', {'class': 'gs_rt'})
            if rt:
                # Drop the citation marker span before reading the title.
                ctu = rt.find('span')
                if ctu:
                    ctu.extract()
                self.article['title'] = unescape(''.join(rt.findAll(text=True)).strip())
                if rt.a:
                    self.article['url'] = self._path2url(rt.a['href'])
            if tag.find('div', {'class': 'gs_a'}):
                year = self.year_re.findall(tag.find('div', {'class': 'gs_a'}).text)
                self.article['year'] = year[0] if len(year) > 0 else None
            if tag.find('div', {'class': 'gs_fl'}):
                self._parse_links(tag.find('div', {'class': 'gs_fl'}))
            if tag.find('div', {'class': 'gs_rs'}):
                self.article['summary'] = tag.find('div', {'class': 'gs_rs'}).text
    if self.article['title']:
        self.handle_article(self.article)
def __init__(self, results, url, trackers):
    """Result-row parser.

    results  -- shared container for parsed items
    url      -- base URL of the queried site
    trackers -- tracker data kept for building result entries
    """
    HTMLParser.__init__(self)
    self.results = results
    self.url = url
    self.trackers = trackers
    # Per-row parsing state.
    self.td_counter = None
    self.current_item = None
def __init__(self, html=None):
    """Create the parser and, when *html* is given, parse it immediately.

    html -- optional document to feed right away.
    """
    HTMLParser.__init__(self)
    self.results = []
    # PEP 8: compare against None with `is not`, not `!=`.
    if html is not None:
        self.feed(html)
def __init__(self):
    """Pretty-printer state, seeded with the standard HTML header."""
    HTMLParser.__init__(self)
    self._level = 0        # current nesting depth
    self._last = ''        # last item handled
    self._in_code = False  # inside a <code> element
    self._prettified = [BASE_HTML_HEADER]
def __init__(self):
    """Initialize the parser with an empty result list."""
    HTMLParser.__init__(self)
    self.result = []
def __init__(self):
    """Text extractor: buffers data chunks, optionally suppressing output."""
    HTMLParser.__init__(self)
    self._buf = []
    self.hide_output = False
def __init__(self, logger=None, callback=None):
    """Parser with an optional logger and an optional callback hook."""
    HTMLParser.__init__(self)
    self.__logger = logger
    self.__stack = []
    self.__data = []
    self.__callback = callback
def __init__(self):
    """Tree-builder state: root element, open-element stack, text targets."""
    HTMLParser.__init__(self)
    self.root = None     # set once the first element is created
    self._stack = []     # currently open elements
    self._texting = None
    self._tailing = None
def __init__(self, **kw):
    """Forward all keyword options to HTMLParser; collect fed fragments."""
    HTMLParser.__init__(self, **kw)
    self._fed = []
def __init__(self):
    """Element collector; a_flag gates whether elements are recorded."""
    HTMLParser.__init__(self)
    self.a_flag = True
    self.elements = []
def __init__(self, allows=None):
    """HTML sanitizer setup.

    allows -- optional list of tag names to allow; when None or empty,
              the class-level default allow_tags is kept.

    The original signature used a mutable default (`allows=[]`), a classic
    Python pitfall; None is behaviorally identical here because the
    argument is only ever truth-tested.
    """
    HTMLParser.__init__(self)
    self.allow_tags = allows if allows else self.allow_tags
    self.result = []
    self.start = []
    self.data = []
def __init__(self, results, url, *args):
    """Row parser; extra positional arguments are accepted and ignored."""
    HTMLParser.__init__(self)
    self.url = url
    self.results = results
    # Per-row parsing state.
    self.td_counter = None
    self.current_item = None
def __init__(self, base):
    """URL resolver anchored at *base*; the result starts unresolved."""
    HTMLParser.__init__(self)
    self._base = base
    self._resolved_url = None
def reset(self):
    """Reset parser state and replace any pending entries with an
    exhausted iterator."""
    HTMLParser.reset(self)
    self.entry = iter(())
def __init__(self):
    """Result-section flag plus the rolls collected so far."""
    HTMLParser.__init__(self)
    self.inResult = False
    self.rolls = []
def __init__(self):
    """Dockerfile extractor: buffers text while the relevant tag is open."""
    self.buffering = False
    self.buf = ''
    self.dockerfile = ''
    HTMLParser.__init__(self)
def process_html(content):
    """Return *content* with HTML character entities decoded.

    Uses html.unescape directly: HTMLParser.unescape was deprecated in
    Python 3.4 and removed in 3.9, so instantiating a parser just to
    unescape no longer works on modern interpreters.
    """
    import html
    return html.unescape(content)
def __init__(self):
    """Track the tag currently open and the collected title text."""
    HTMLParser.__init__(self)
    self._cur_tag = ""
    self.title = ""
def __init__(self):
    """Game-list parser: interest flag, current record, and results."""
    HTMLParser.__init__(self)
    self._wanted = False
    self._current_game = None
    self._games = []
def __init__(self, htype):
    """Parser configured for element type *htype*."""
    HTMLParser.__init__(self)
    self.result = []
    self.this_llama = []
    self.scan = False  # toggled while inside a section of interest
    self.htype = htype
def __init__(self):
    """Login-status parser state."""
    HTMLParser.__init__(self)
    self.lname = ''
    self.notlogin = False
    self.result = []
def __init__(self):
    """Remember only the first image encountered (None until seen)."""
    HTMLParser.__init__(self)
    self.first_image = None
def __init__(self, *args, **kwargs):
    """Pass all options through; collect unique elements in a set."""
    HTMLParser.__init__(self, *args, **kwargs)
    self.elements = set()