def _issue_to_dict(self, issue):
    parser = HTMLParser()

    if self.use_old_api:
        args = {
            'project': self.project_settings['key'],
            'summary': parser.unescape(issue.summary),
            'description': parser.unescape(issue.description),
            'issuetype': {'name': issue.type},
        }
        args.update(self._get_custom_fields(issue))

        try:
            support_user = models.SupportUser.objects.get(user=issue.caller)
            key = support_user.backend_id or issue.caller.email
        except models.SupportUser.DoesNotExist:
            key = issue.caller.email

        args[self.get_field_id_by_name(self.issue_settings['caller_field'])] = [{
            "name": key,  # will be equal to issue.caller.email for non-support users
            "key": key,
        }]
        return args

    args = {
        'requestFieldValues': {
            'summary': parser.unescape(issue.summary),
            'description': parser.unescape(issue.description),
        }
    }
    support_customer = issue.caller.supportcustomer
    args['requestParticipants'] = [support_customer.backend_id]
    return args

def feed(self, string):
    try:
        HTMLParser.feed(self, string)
    except Exception:  # pragma: no cover
        import traceback
        traceback.print_exc()
        self.out.write(string)

def convert_to_colour_list(cls, colours, *args, **kwargs):
    """
    Takes a whole munge of nonsense input and converts it into a list of colours.
    Will split apart comma-delimited strings.
    Will decode HTML chars.
    Will concatenate a mixture of comma strings and items.
    """
    colours = copy.deepcopy(colours)  # Ensure we don't bugger up original
    if isinstance(colours, (str, unicode)):
        colours = [colours]  # Listify
    colours.extend(args)

    # Add in comma delimited stuff
    intermediate_list = []
    h = HTMLParser()
    for colour_term in colours:
        if isinstance(colour_term, (str, unicode)):
            colour_term_decoded = h.unescape(colour_term)  # HTML char decode
            colour_terms_list = colour_term_decoded.split(",")
            intermediate_list.extend(colour_terms_list)
        else:
            intermediate_list.append(colour_term)

    # Now sanitise the list again
    output_list = []
    for colour in intermediate_list:
        if isinstance(colour, (str, unicode)):
            colour_clean = colour.strip()
            output_list.append(colour_clean)
    return output_list

def __init__(self, builder=None, encoding=None):
    self.__stack = []
    if builder is None:
        builder = ElementTree.TreeBuilder()
    self.__builder = builder
    self.encoding = encoding or "iso-8859-1"
    HTMLParser.__init__(self)

def _prepare_message(self, message):
    # Slack doesn't process HTML entities
    html_parser = HTMLParser()
    message = html_parser.unescape(message)
    # Slack also doesn't render HTML itself
    message = strip_tags(message)
    return message

def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.in_anchor = False
    self.attrs = None
    self.title = ''

def __init__(self, *args, **kwargs):
    if sys.version_info > (3, 4):  # pragma: no cover
        HTMLParser.__init__(self, convert_charrefs=False)
    else:  # pragma: no cover
        HTMLParser.__init__(self)
    super(HTMLRewriter, self).__init__(*args, **kwargs)

def _extract_programs(html, channel):
    """ Extract Programs from HTML code """
    parser = HTMLParser()

    # Item regexes
    regex_item = re.compile(
        r'<a[^>]+?href="(?P<path>[^"]+)"[^>]+?>'
        r'.*?<h3 class="poster-teaser__title"><span>(?P<title>[^<]*)</span></h3>.*?'
        r'</a>', re.DOTALL)

    # Extract items
    programs = []
    for item in regex_item.finditer(html):
        path = item.group('path')
        if path.startswith('/video'):
            continue

        title = parser.unescape(item.group('title'))

        # Program
        programs.append(Program(
            path=path.lstrip('/'),
            channel=channel,
            title=title,
        ))

    return programs

def get_hot_bills(self):
    """
    Get list of most viewed bills from last week

    @return: list of dicts of the form:
        {
            'congress': which # congress,
            'number': bill #,
            'title': short text,
        }
    """
    soup = self._get("Most-Viewed+Bills")
    table = soup.find("table", class_="confluenceTable")
    if table:
        to_ret = []
        rows = table.findAll("tr")
        h = HTMLParser()
        for row in rows:
            columns = row.findAll("td")
            if columns and len(columns) == 3:
                bill = {}
                bill['congress'] = re.search(
                    r"\[(\d+)\w+\]", columns[1].contents[1]).groups()[0]
                bill['congress'] = int(bill['congress'])
                bill['number'] = columns[1].find("a").contents[0].strip()
                bill['title'] = h.unescape(
                    re.sub(r"\"", "", columns[2].contents[0]))
                to_ret.append(bill)
        return to_ret

def feed(self, data):
    data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
    data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
    data = data.replace('&#39;', "'")
    data = data.replace('&#34;', '"')
    HTMLParser.feed(self, data)
    HTMLParser.close(self)

def extended_stats(self, user=None):
    if not user:
        data = self.api.me()
    else:
        if isinstance(user, str):
            data = self.api.get_user('%s' % str(user.replace('@', '')))
        else:
            raise InvalidParameter

    logging.info("[*] Created: %s" % data.created_at)
    logging.info("[*] Description: %s" % data.description)
    logging.info("[*] Last update: %s" % data.status.created_at)

    hashtags = ' '.join(
        ["#%s" % x['text'] for x in data.status.entities['hashtags']])
    mentions = ' '.join(
        ["@%s" % x['screen_name'] for x in data.status.entities['user_mentions']])

    logging.info("[*] \tUser Mentions: %s" % mentions)
    logging.info("[*] \tHashtags: %s" % hashtags)

    html = HTMLParser()
    if "RT @" in data.status.text:
        logging.info(
            "[*] \tRetweet Text: %s"
            % html.unescape(data.status.text.replace('\n', '\n\t\t ')))
    else:
        logging.info(
            "[*] \tTweet Text: %s"
            % html.unescape(data.status.text.replace('\n', '\n\t\t ')))
    logging.info('[*] \tRetweet Count: %s' % str(data.status.retweet_count))

def strip_tags(string, allowed_tags=''):
    if allowed_tags != '':
        # Get a list of all allowed tag names.
        allowed_tags_list = re.sub(r'[\\/<> ]+', '', allowed_tags).split(',')
        allowed_pattern = ''
        for s in allowed_tags_list:
            if s == '':
                continue
            # Add all possible patterns for this tag to the regex.
            if allowed_pattern != '':
                allowed_pattern += '|'
            allowed_pattern += '<' + s + ' [^><]*>$|<' + s + '>'
        # Get all tags included in the string.
        all_tags = re.findall(r'<[^>]+>', string, re.I)
        for tag in all_tags:
            # If not allowed, replace it.
            if not re.match(allowed_pattern, tag, re.I):
                string = string.replace(tag, '')
    else:
        # If no allowed tags, remove all.
        string = re.sub(r'<[^>]*?>', '', string)

    h = HTMLParser()
    string = h.unescape(string)
    return string

def parse_denied_courses(school_html):
    root = fromstring(school_html)
    denied_table = root.cssselect('#NcaaCrs_DeniedCategory_All')
    courses = []
    for tr in denied_table[0].cssselect('tr')[1:]:
        tables = tr.cssselect('table')
        try:
            subject = tables[0].cssselect('.hs_tableHeader')[0].text_content()
        except IndexError:
            continue
        for course_tr in tables[1].cssselect('tbody tr'):
            course = {}
            tds = course_tr.cssselect('td')
            course['subject'] = subject
            course['course_weight'] = tds[0].text_content().strip()
            h = HTMLParser()
            course['title'] = h.unescape(tds[1].text_content().strip())
            course['notes'] = tds[2].text_content().strip()
            course['max_credits'] = tds[3].text_content().strip()
            course['ok_through'] = tds[4].text_content().strip()
            course['reason_code'] = tds[5].text_content().strip()
            course['disability_course'] = tds[6].text_content().strip()
            courses.append(course)
    return courses

def get_link(url):
    if 'apitvh.net' in url \
            or 'tvhayz.net' in url \
            or 'tvhays.org' in url \
            or 'tvhai.org' in url:
        url = re.search(r'\?link=(.*)', url).group(1)

    response = Request().get(url)
    m = re.search('data-options="(.+?)"', response)
    h = HTMLParser()
    try:
        s = m.group(1)
    except Exception:
        raise Exception("Link has been removed")
    s = h.unescape(s)
    s = json.loads(s)
    s = json.loads(s['flashvars']['metadata'])
    items = [(i['url'], rsl(i['name'])) for i in s['videos']]
    items = sorted(items, key=lambda elem: int(elem[1]), reverse=True)

    if len(items) == 1:
        return items[0]

    listitems = []
    for i in items:
        listitems.append("%s (%s)" % (i[1], i[0]))
    index = xbmcgui.Dialog().select("Select ok.ru stream", listitems)
    if index == -1:
        return None, None
    return items[index]

def __init__(self, *args, **kwargs):
    if sys.version_info > (3, 4):  # pragma: no cover
        HTMLParser.__init__(self, convert_charrefs=False)
    else:  # pragma: no cover
        HTMLParser.__init__(self)
    super(HTMLRewriter, self).__init__(*args, **kwargs)

def _issue_to_dict(self, issue):
    """ Convert issue to dict that can be accepted by JIRA as input parameters """
    caller = issue.caller.full_name or issue.caller.username
    parser = HTMLParser()
    args = {
        'project': self.project_settings['key'],
        'summary': parser.unescape(issue.summary),
        'description': parser.unescape(issue.description),
        'issuetype': {
            'name': issue.type,
        },
        self._get_field_id_by_name(self.issue_settings['caller_field']): caller,
    }

    if issue.reporter:
        args[self._get_field_id_by_name(
            self.issue_settings['reporter_field'])] = issue.reporter.name
    if issue.impact:
        args[self._get_field_id_by_name(
            self.issue_settings['impact_field'])] = issue.impact
    if issue.priority:
        args['priority'] = {'name': issue.priority}
    return args

def __init__(self, encoding='iso8859-1'):
    HTMLParser.__init__(self)
    self.encoding = encoding
    self.tagstack = []
    self.checkflag = 0  # Are we in a tag we check?
    self.inbody = 0
    self.__data = []

def display_link_prompt(args, urls, titles):
    """Print URLs and their descriptions alongside a prompt.

    Keyword arguments:
        args -- program arguments (dict)
        urls -- search URLs found (list)
        titles -- descriptions of search URLs found (list)
    """
    while 1:
        print('\n{0}'.format(BORDER))
        for i in range(len(urls)):
            link = HTMLParser().unescape(titles[i])
            print('{0}. {1}'.format(i + 1, link.encode('utf-8') if PY2 else link))
        print(BORDER)

        # Handle link prompt input
        try:
            link_input = [inp.strip() for inp in input(': ').split()]
            if not link_input:
                continue
            utils.check_input(link_input)  # Check input in case of quit
            print('\n')
            exec_prompt_cmd(args, urls, link_input[0], link_input[1:])
        except (KeyboardInterrupt, EOFError, ValueError, IndexError):
            return False

def __init__(self, url):
    HTMLParser.__init__(self)
    if url[-1] != '/':
        url += '/'
    self.__url = url
    self.links = set()

def __init__(self, url, session=None, authentication=None, timeout=None):
    """Create instance of a directory parser.

    :param url: url of the directory on the web server.
    :param session: a requests Session instance used to fetch the directory
        content. If None, a new session will be created.
    :param authentication: a tuple (username, password) to authenticate against
        the web server, or None for no authentication. Note that it will only
        be used if the given *session* is None.
    :param timeout: timeout in seconds used when fetching the directory
        content.
    """
    if not session:
        session = requests.Session()
        session.auth = authentication
    self.session = session
    self.timeout = timeout

    self.active_url = None
    self.entries = []
    HTMLParser.__init__(self)

    # Force the server to not send cached content
    headers = {'Cache-Control': 'max-age=0'}
    r = self.session.get(url, headers=headers, timeout=self.timeout)

    try:
        r.raise_for_status()
        self.feed(r.text)
    finally:
        r.close()

def get_programs(self, channel):
    """ Get a list of all programs of the specified channel.
    :type channel: str
    :rtype list[Program]

    NOTE: This function doesn't use an API.
    """
    if channel not in CHANNELS:
        raise Exception('Unknown channel %s' % channel)

    # Load webpage
    data = self._get_url(CHANNELS[channel]['url'])

    # Parse programs
    h = HTMLParser()
    regex_programs = re.compile(
        r'<a class="program-overview__link" href="(?P<path>[^"]+)">\s+'
        r'<span class="program-overview__title">\s+(?P<title>[^<]+)</span>.*?'
        r'</a>', re.DOTALL)

    programs = [
        Program(channel=channel,
                path=program.group('path').lstrip('/'),
                title=h.unescape(program.group('title').strip()))
        for program in regex_programs.finditer(data)
    ]

    return programs

def _get_event():
    event = [e for e in session['events'] if e['id'] == request.args.get('event')]
    if event:
        h = HTMLParser()
        event[0]['description'] = h.unescape(event[0]['description'])
        return jsonify(event[0])
    return ''

def __init__(self):
    if is_py3():
        HTMLParser.__init__(self, convert_charrefs=True)
    else:
        HTMLParser.__init__(self)
    self._output = ''

def get_episode(self, channel, path):
    """ Get an Episode object from the specified page.
    :type channel: str
    :type path: str
    :rtype Episode

    NOTE: This function doesn't use an API.
    """
    if channel not in CHANNELS:
        raise Exception('Unknown channel %s' % channel)

    # Load webpage
    page = self._get_url(CHANNELS[channel]['url'] + '/' + path)

    # Extract program JSON
    parser = HTMLParser()
    regex_program = re.compile(r'data-hero="([^"]+)', re.DOTALL)
    json_data = parser.unescape(regex_program.search(page).group(1))
    data = json.loads(json_data)['data']
    program = self._parse_program_data(data)

    # Extract episode JSON
    regex_episode = re.compile(
        r'<script type="application/json" data-drupal-selector="drupal-settings-json">(.*?)</script>',
        re.DOTALL)
    json_data = parser.unescape(regex_episode.search(page).group(1))
    data = json.loads(json_data)

    # Lookup the episode in the program JSON based on the nodeId
    # The episode we just found doesn't contain all information
    for episode in program.episodes:
        if episode.nodeid == data['pageInfo']['nodeId']:
            return episode

    return None

def cmd_genpot(config, options):
    """Generate the gettext pot file"""

    os.chdir(config.source_dir)

    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)

    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                file_path = os.path.relpath(os.path.join(root, file_name),
                                            config.source_dir)
                python_files.append(file_path)
    python_files.sort()

    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    f = open(pot_file, 'w')
    f.write('#: activity/activity.info:2\n')
    f.write('msgid "%s"\n' % escaped_name)
    f.write('msgstr ""\n')
    if config.summary is not None:
        escaped_summary = _po_escape(config.summary)
        f.write('#: activity/activity.info:3\n')
        f.write('msgid "%s"\n' % escaped_summary)
        f.write('msgstr ""\n')

    if config.description is not None:
        parser = HTMLParser()
        strings = []
        parser.handle_data = strings.append
        parser.feed(config.description)

        for s in strings:
            s = s.strip()
            if s:
                f.write('#: activity/activity.info:4\n')
                f.write('msgid "%s"\n' % _po_escape(s))
                f.write('msgstr ""\n')
    f.close()

    args = [
        'xgettext', '--join-existing', '--language=Python',
        '--keyword=_', '--add-comments=TRANS:', '--output=%s' % pot_file
    ]
    args += python_files

    retcode = subprocess.call(args)
    if retcode:
        print('ERROR - xgettext failed with return code %i.' % retcode)

def original_unescape(self, s):
    """Since we need to use this sometimes"""
    if isinstance(s, basestring):
        return unicode(HTMLParser.unescape(self, s))
    elif isinstance(s, list):
        return [unicode(HTMLParser.unescape(self, item)) for item in s]
    else:
        return s

def __init__(self, allows=None):
    HTMLParser.__init__(self)
    if allows is None:
        allows = []
    self.allow_tags = allows if allows else self.allow_tags
    self.result = []
    self.start = []
    self.data = []

def __init__(self, style, styles=None):
    HTMLParser.__init__(self)
    self.s = ''
    self.style = style
    self.styles = styles if styles else default_styles
    self.style_stack = []

def __init__(self, styled, styles=None):
    HTMLParser.__init__(self)
    self.s = ''
    self.styled = styled
    self.styles = styles if styles else default_styles
    self.style_stack = []

def __init__(self, bot: "Bot"):
    self.bot = bot
    self.translate_client = translate.Client()  # _http=self.bot.http)
    self.h = HTMLParser()
    self._spam_check = defaultdict(SpamChecker)
    # channel_id: list
    self.chat_history = defaultdict(lambda: [])

def __init__(self):
    HTMLParser.__init__(self)
    self._ignore = False
    self._ignorePath = None
    self._lasttag = None
    self._depth = 0
    self.depthText = {}  # path:text
    self.counting = 0
    self.lastN = 0

def __init__(self, media_locator, link_handler):
    HTMLParser.__init__(self)
    self.handlers_start = StartRules(media_locator, link_handler)
    self.handlers_startend = StartEndRules(media_locator, link_handler)
    self.handlers_end = EndRules()
    self.new_buffer()
    self.stack = deque()
    self.stack.append([])

def __init__(self, max_words):
    # In Python 2, HTMLParser is not a new-style class,
    # hence super() cannot be used.
    HTMLParser.__init__(self)
    self.max_words = max_words
    self.words_found = 0
    self.open_tags = []
    self.truncate_at = None

def feed(self, *args, **kwargs):
    try:
        # With Python 2, super() cannot be used.
        # See the comment for __init__().
        HTMLParser.feed(self, *args, **kwargs)
    except self.TruncationCompleted as exc:
        self.truncate_at = exc.truncate_at
    else:
        self.truncate_at = None

def __init__(self, _file, search_tag):
    if six.PY3:
        super(TemplateParser, self).__init__()
    else:
        # HTMLParser is not a new-style class in py2
        HTMLParser.__init__(self)

    self.search_tag = search_tag
    self.file = _file
    self.parsed_data = []

def replaceHTMLCodes(txt):
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    txt = txt.replace("&#38;", "&")
    txt = txt.replace("&nbsp;", "")
    return txt

def __init__(self):
    HTMLParser.__init__(self)
    self.text_name = None
    self.original_value = None
    self.new_value = None
    self.in_tag = False
    self.read_buffer = six.StringIO()

def cmd_genpot(config, options):
    """Generate the gettext pot file"""

    os.chdir(config.source_dir)

    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)

    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                file_path = os.path.relpath(os.path.join(root, file_name),
                                            config.source_dir)
                python_files.append(file_path)
    python_files.sort()

    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    f = open(pot_file, 'w')
    f.write('#: activity/activity.info:2\n')
    f.write('msgid "%s"\n' % escaped_name)
    f.write('msgstr ""\n')
    if config.summary is not None:
        escaped_summary = _po_escape(config.summary)
        f.write('#: activity/activity.info:3\n')
        f.write('msgid "%s"\n' % escaped_summary)
        f.write('msgstr ""\n')

    if config.description is not None:
        parser = HTMLParser()
        strings = []
        parser.handle_data = strings.append
        parser.feed(config.description)

        for s in strings:
            s = s.strip()
            if s:
                f.write('#: activity/activity.info:4\n')
                f.write('msgid "%s"\n' % _po_escape(s))
                f.write('msgstr ""\n')
    f.close()

    args = ['xgettext', '--join-existing', '--language=Python',
            '--keyword=_', '--add-comments=TRANS:',
            '--output=%s' % pot_file]
    args += python_files

    retcode = subprocess.call(args)
    if retcode:
        print('ERROR - xgettext failed with return code %i.' % retcode)

def logged_in(self, y):
    if all([None is y or 'logout' in y,
            bool(filter(lambda c: 'remember_web_' in c, self.session.cookies.keys()))]):
        if None is not y:
            self.shows = dict(re.findall('<option value="(\d+)">(.*?)</option>', y))
            h = HTMLParser()
            for k, v in self.shows.items():
                self.shows[k] = sanitizeSceneName(h.unescape(unidecode(v.decode('utf-8'))))
        return True
    return False

def add_set(self, title, description, index=-1):
    widget = QtWidgets.QCheckBox(title.replace('&', '&&'))
    if description:
        h = HTMLParser()
        widget.setToolTip(h.unescape(description))
    if index >= 0:
        self.sets_widget.layout().insertWidget(index, widget)
    else:
        self.sets_widget.layout().addWidget(widget)
    return widget

def __init__(self, *args, **kwargs):
    if sys.version_info > (3,):
        super(AnchorParser, self).__init__(*args, **kwargs)
    else:  # pragma: no cover
        # HTMLParser is still an old style object and so super doesn't
        # work
        HTMLParser.__init__(self, *args, **kwargs)

    self.capture = 0
    self.url = ''
    self.text = ''

def check_bz_bug(b):
    '''Return status of a bug in BZ'''
    html = get_html(b)
    if html:
        text = html.content.decode('utf-8')
        name = TITLE.search(text).group(1) if TITLE.search(text) else ''
        h = HTMLParser()
        name = h.unescape(name)
    else:
        name = ''
    return name, None

def get_formatted_value(value, field):
    '''Prepare field from raw data'''

    from six.moves.html_parser import HTMLParser

    if getattr(field, 'fieldtype', None) in ["Text", "Text Editor"]:
        h = HTMLParser()
        value = h.unescape(value)
        value = re.subn(r'(?s)<[\s]*(script|style).*?</\1>', '', text_type(value))[0]
        value = ' '.join(value.split())
    return field.label + " : " + strip_html_tags(text_type(value))

def get_email_subject(self):
    """
    WARNING: It is MANDATORY to override this method if you are going to send
    email using the `send_notification_email` method.

    Your class must define an `email_subject_tmpl` attribute containing a
    template path to a file that has your email subject.
    """
    # Convert the html back to plaintext after rendering it using the template
    # to get rid of html ampersand character codes
    parser = HTMLParser()
    html_email = self._get_email_field('email_subject_tmpl', 'get_email_subject')
    return parser.unescape(html_email)

def __init__(self):
    HTMLParser.__init__(self)
    self._encoding = 'ISO-8859-1'
    self._handlers = {'table_start': self.table_start,
                      'table_end': self.table_end,
                      'tr_start': self.tr_start,
                      'tr_end': self.tr_end,
                      'td_start': self.td_start,
                      'td_end': self.td_end,
                      'th_start': self.td_start,
                      'th_end': self.td_end,
                      'br_start': self.br_start,
                      'meta_start': self.meta_start}

def __init__(self, tag="a", attr="href", process=None, unique=False):
    HTMLParser.__init__(self)

    warnings.warn(
        "HtmlParserLinkExtractor is deprecated and will be removed in "
        "future releases. Please use scrapy.linkextractors.LinkExtractor",
        ScrapyDeprecationWarning, stacklevel=2,
    )

    self.scan_tag = tag if callable(tag) else lambda t: t == tag
    self.scan_attr = attr if callable(attr) else lambda a: a == attr
    self.process_attr = process if callable(process) else lambda v: v
    self.unique = unique

def __init__(self, max_words):
    # In Python 2, HTMLParser is not a new-style class,
    # hence super() cannot be used.
    try:
        HTMLParser.__init__(self, convert_charrefs=False)
    except TypeError:
        # pre Python 3.3
        HTMLParser.__init__(self)
    self.max_words = max_words
    self.words_found = 0
    self.open_tags = []
    self.last_word_end = None
    self.truncate_at = None

def replace_html_entities(xml_bin_str):
    """XML does not contain entity references for many HTML entities, yet the
    Federal Register XML sometimes contains the HTML entities. Replace them
    here, lest we throw off XML parsing"""
    parser = HTMLParser()
    match = HTML_RE.search(xml_bin_str)
    while match:
        match_bin = match.group(0)
        match_str = match_bin.decode('utf-8')
        replacement = parser.unescape(match_str).encode('UTF-8')
        logger.debug("Replacing %s with %s in retrieved XML",
                     match_str, replacement)
        xml_bin_str = xml_bin_str.replace(match_bin, replacement)
        match = HTML_RE.search(xml_bin_str)
    return xml_bin_str

def parse_endtag(self, i):
    # This is necessary because the underlying HTMLParser is buggy and
    # unreliable.
    try:
        return HTMLParser.parse_endtag(self, i)
    except AttributeError:
        return -1

def __init__(self, styled):
    HTMLParser.__init__(self)
    self.s = ''
    self.styled = styled
    self.styles = {
        'err': MyHTMLParser.term.red,
        'ref': MyHTMLParser.term.yellow,
        'rev': MyHTMLParser.term.bold,
        'cmd': MyHTMLParser.term.cyan + self.term.underline,
        # 'sub': term.cyan,
        'echo': MyHTMLParser.term.yellow,
    }
    self.style_stack = []

def _internal_close(self):
    if self._wb_parse_context:
        end_tag = '</' + self._wb_parse_context + '>'
        self.feed(end_tag)
        self._wb_parse_context = None

    # if head_insert hasn't been inserted yet, but some content has already
    # been written out, insert it now
    if self.head_insert and self.parsed_any:
        self.out.write(self.head_insert)
        self.head_insert = None

    try:
        HTMLParser.close(self)
    except Exception:  # pragma: no cover
        # only raised in 2.6
        pass

class Feed:
    # Class to handle Feeds

    def __init__(self, data, markup):
        self.obj = BeautifulSoup(data, markup)
        self.html_parser = HTMLParser()

    def getFeeds(self):
        # instantiate
        feeds = {}
        # get title
        feeds['title'] = self.getTitle()
        # get link
        feeds['link'] = self.getLink()
        # get items
        feeds['items'] = self.setupItems()
        return feeds

    def getTitle(self):
        return self.obj.title.string

    def getLink(self):
        return self.obj.find('link').string

    def getItems(self):
        return self.obj.find_all('item')

    def setupItems(self):
        items = self.getItems()
        data = []
        for item in items:
            new_item = {
                'title': self.html_parser.unescape(item.title.string),
                'link': item.find("link").string,
                'comments_link': item.find("comments"),
                'publication_date': item.find('pubDate').text,
                'author': self.html_parser.unescape(item.find('creator').text),
            }
            data.append(new_item)
        return data

def __init__(self, settings, filename):
    try:
        # Python 3.4+
        HTMLParser.__init__(self, convert_charrefs=False)
    except TypeError:
        HTMLParser.__init__(self)
    self.body = ''
    self.metadata = {}
    self.settings = settings

    self._data_buffer = ''
    self._filename = filename
    self._in_top_level = True
    self._in_head = False
    self._in_title = False
    self._in_body = False
    self._in_tags = False

def __init__(self, typogrify, html_doc):
    self.html_doc = html_doc.strip()
    try:
        # Python 3.4+
        HTMLParser.__init__(self, convert_charrefs=False)
    except TypeError:
        HTMLParser.__init__(self)

    # Mark the new line positions - needed to
    # determine the position within the input string
    #
    # ACTUALLY - we should use StringIO here instead
    new_line = 1
    self.new_line_pos[new_line] = 0
    for index, char in enumerate(self.html_doc):
        if char == "\n":
            new_line += 1
            # Add one due to index being zero based
            self.new_line_pos[new_line] = index + 1

    self.typogrify = typogrify
    self.feed(self.html_doc)  # start parsing

def save(self, page, crawl=False):
    """Requests and saves a remote page to a local file"""
    print('Saving ' + page)
    html = self.fetch(page, crawl)
    content = html.replace('</p>', '\n')
    content = re.sub(r'<.*?>', ' ', content)
    content = HTMLParser().unescape(content)
    content = content.encode('utf8')
    if self.out_file:
        with open(self.out_file, 'a') as handle:
            handle.write(content + '\n')
    elif self.out_dir:
        page_key = self.page_key(page)
        try:
            os.makedirs(os.path.join(self.out_dir, page_key[0]))
        except OSError:
            pass
        filename = os.path.join(self.out_dir, page_key[0],
                                (page_key[1] + '?' + page_key[2]).replace('/', '_'))
        with open(filename, 'w') as handle:
            handle.write(content)