예제 #1
0
    def get_episode(self, channel, path):
        """ Get a Episode object from the specified page.
        :type channel: str
        :type path: str
        :rtype Episode
        NOTE: This function doesn't use an API.
        """
        if channel not in CHANNELS:
            raise Exception('Unknown channel %s' % channel)

        # Load webpage
        page = self._get_url(CHANNELS[channel]['url'] + '/' + path)

        # Extract program JSON
        parser = HTMLParser()
        # The data-hero attribute carries HTML-escaped JSON; the pattern has
        # no closing quote, so the capture runs up to the next '"'.
        regex_program = re.compile(r'data-hero="([^"]+)', re.DOTALL)
        # NOTE(review): search() returns None when data-hero is missing,
        # which would raise AttributeError here — confirm pages always have it.
        json_data = parser.unescape(regex_program.search(page).group(1))
        data = json.loads(json_data)['data']
        program = self._parse_program_data(data)

        # Extract episode JSON
        regex_episode = re.compile(
            r'<script type="application/json" data-drupal-selector="drupal-settings-json">(.*?)</script>',
            re.DOTALL)
        json_data = parser.unescape(regex_episode.search(page).group(1))
        data = json.loads(json_data)

        # Lookup the episode in the program JSON based on the nodeId
        # The episode we just found doesn't contain all information
        for episode in program.episodes:
            if episode.nodeid == data['pageInfo']['nodeId']:
                return episode

        # No matching nodeId found in the program listing.
        return None
def ctx_dict(request):
    """Context processor that lazily seeds the Region/Provincia/Distrito
    tables from the Lista_* CSV-style string lists, then returns an empty
    template context.

    Each Lista_* entry is a comma-separated string: 'id,name[,parent_id]'.
    Seeding only runs when the corresponding table is empty.
    """
    context = {}
    # Hoisted out of the loops: the parser is loop-invariant; the original
    # built a fresh HTMLParser for every single row.
    h = HTMLParser()

    def _clean_name(raw):
        # Strip quotes, decode HTML entities, normalise to 'Capitalized'.
        return str(h.unescape(raw.replace("'", ""))).lower().capitalize()

    if not Region.objects.all().exists():
        for valor_region in Lista_regiones:
            valor_region_lista = valor_region.split(',')
            objeto_region = Region()
            objeto_region.id = int(valor_region_lista[0])
            objeto_region.Nombre = _clean_name(valor_region_lista[1])
            objeto_region.save()

    if not Provincia.objects.all().exists():
        for valor_provincia in Lista_provincia:
            valor_provincia_lista = valor_provincia.split(',')
            objeto_provincia = Provincia()
            objeto_provincia.id = int(valor_provincia_lista[0])
            # NOTE(review): kept as in the original — region_id is assigned a
            # string here while the FK below uses int; confirm both fields
            # are really needed.
            objeto_provincia.region_id = valor_provincia_lista[2]
            objeto_provincia.Nombre = _clean_name(valor_provincia_lista[1])
            objeto_provincia.region_provincia = Region.objects.get(
                id=int(valor_provincia_lista[2]))
            objeto_provincia.save()

    if not Distrito.objects.all().exists():
        for valor_distrito in Lista_distrito:
            valor_distrito_lista = valor_distrito.split(',')
            objeto_distrito = Distrito()
            objeto_distrito.id = int(valor_distrito_lista[0])
            objeto_distrito.Nombre = _clean_name(valor_distrito_lista[1])
            objeto_distrito.provincia_id = int(valor_distrito_lista[2])
            objeto_distrito.provincia_distrito = Provincia.objects.get(
                id=int(valor_distrito_lista[2]))
            objeto_distrito.save()

    return context
예제 #3
0
 def extended_stats(self, user=None):
     """Log extended profile/status information for a Twitter account.

     :param user: screen name (str); a leading '@' is stripped. When None,
         the authenticated account (``self.api.me()``) is used.
     :raises InvalidParameter: if ``user`` is neither None nor a str.
     """
     if not user:
         data = self.api.me()
     else:
         if isinstance(user, str):
             data = self.api.get_user('%s' % str(user.replace('@', '')))
         else:
             raise InvalidParameter
     logging.info("[*] Created: %s" % data.created_at)
     logging.info("[*] Description: %s" % data.description)
     # NOTE(review): accounts with no tweets may lack `status` — confirm
     # upstream guarantees before relying on data.status below.
     logging.info("[*] Last update: %s" % data.status.created_at)
     hashtags = ' '.join(
         [ "#%s" % x['text'] for x in \
          data.status.entities['hashtags']]
     )
     mentions = ' '.join(
         [ "@%s" % x['screen_name'] for x in \
             data.status.entities['user_mentions']]
     )
     logging.info("[*] \tUser Mentions: %s" % mentions)
     logging.info("[*] \tHashtags: %s" % hashtags)
     html = HTMLParser()
     # Retweets and plain tweets get different labels; text is unescaped and
     # re-indented so multi-line tweets align in the log output.
     if "RT @" in data.status.text:
         logging.info(
             "[*] \tRetweet Text: %s" %
             html.unescape(data.status.text.replace('\n', '\n\t\t    ')))
     else:
         logging.info(
             "[*] \tTweet Text: %s" %
             html.unescape(data.status.text.replace('\n', '\n\t\t    ')))
     logging.info('[*] \tRetweet Count: %s' %
                  str(data.status.retweet_count))
예제 #4
0
    def _issue_to_dict(self, issue):
        """ Convert issue to dict that can be accepted by JIRA as input parameters """
        parser = HTMLParser()
        caller = issue.caller.full_name or issue.caller.username
        caller_field = self._get_field_id_by_name(
            self.issue_settings['caller_field'])

        args = {
            'project': self.project_settings['key'],
            'summary': parser.unescape(issue.summary),
            'description': parser.unescape(issue.description),
            'issuetype': {'name': issue.type},
            caller_field: caller,
        }

        # Optional fields are only sent when the issue carries a value.
        if issue.reporter:
            reporter_field = self._get_field_id_by_name(
                self.issue_settings['reporter_field'])
            args[reporter_field] = issue.reporter.name
        if issue.impact:
            impact_field = self._get_field_id_by_name(
                self.issue_settings['impact_field'])
            args[impact_field] = issue.impact
        if issue.priority:
            args['priority'] = {'name': issue.priority}
        return args
예제 #5
0
 def original_unescape(self, s):
     """Since we need to use this sometimes"""
     # NOTE(review): Python 2 only — relies on the `basestring` and
     # `unicode` builtins, which do not exist on Python 3.
     if isinstance(s, basestring):
         return unicode(HTMLParser.unescape(self, s))
     elif isinstance(s, list):
         # Unescape each element; non-string items would raise here.
         return [unicode(HTMLParser.unescape(self, item)) for item in s]
     else:
         # Anything else (None, dicts, ...) is passed through untouched.
         return s
예제 #6
0
    def _extract_programs(html, channel):
        """ Extract Programs from HTML code """
        parser = HTMLParser()

        # Matches a teaser anchor, capturing its href path and title text.
        regex_item = re.compile(
            r'<a[^>]+?href="(?P<path>[^"]+)"[^>]+?>'
            r'.*?<h3 class="poster-teaser__title"><span>(?P<title>[^<]*)</span></h3>.*?'
            r'</a>', re.DOTALL)

        programs = []
        for match in regex_item.finditer(html):
            href = match.group('path')
            # Direct video links are not program pages; skip them.
            if href.startswith('/video'):
                continue
            programs.append(Program(
                path=href.lstrip('/'),
                channel=channel,
                title=parser.unescape(match.group('title')),
            ))

        return programs
예제 #7
0
def get_link(url):
    """Resolve an ok.ru stream URL list from a wrapper page.

    Returns a single (url, resolution) tuple when only one quality exists,
    the user-selected tuple otherwise, or (None, None) if the selection
    dialog is cancelled.

    :raises Exception: when the player markup is no longer on the page.
    """
    # Known mirror hosts wrap the real target in a ?link= query parameter.
    if 'apitvh.net' in url \
            or 'tvhayz.net' in url \
            or 'tvhays.org' in url \
            or 'tvhai.org' in url \
            :
        url = re.search(r'\?link=(.*)', url).group(1)

    response = Request().get(url)
    m = re.search('data-options="(.+?)"', response)
    # BUG FIX: was a bare `except:` wrapped around m.group(1), which hid the
    # real condition (m is None when the player markup is gone) and would
    # also swallow unrelated errors.
    if m is None:
        raise Exception("Link has been removed")
    h = HTMLParser()
    s = h.unescape(m.group(1))
    s = json.loads(s)
    s = json.loads(s['flashvars']['metadata'])
    items = [(i['url'], rsl(i['name'])) for i in s['videos']]
    # Highest resolution first.
    items = sorted(items, key=lambda elem: int(elem[1]), reverse=True)

    if len(items) == 1:
        return items[0]

    listitems = []
    for i in items:
        listitems.append("%s (%s)" % (i[1], i[0]))
    index = xbmcgui.Dialog().select("Select ok.ru stream", listitems)
    if index == -1:
        return None, None
    return items[index]
예제 #8
0
 def _prepare_message(self, message):
     """Return ``message`` made safe for Slack: entities decoded, tags removed."""
     # slack don't process html entities
     html_parser = HTMLParser()
     message = html_parser.unescape(message)
     # slack also don't render html itself
     message = strip_tags(message)
     return message
예제 #9
0
def _get_event():
    """Return the session event matching the ?event=<id> query arg as JSON.

    Returns an empty string when no matching event exists.
    """
    # Linear scan of the events stored in the Flask session.
    event = [e for e in session['events'] if e['id'] == request.args.get('event')]
    if event:
        h = HTMLParser()
        # Descriptions are stored HTML-escaped; decode before returning.
        event[0]['description'] = h.unescape(event[0]['description'])
        return jsonify(event[0])
    return ''
예제 #10
0
    def get_hot_bills(self):
        """
        Get list of most viewed bills from last week

        @return: list of dicts of the form: {
            'congress': which # congress,
            'number': bill #,
            'title': short text,
        }

        NOTE: implicitly returns None when the expected table is missing.
        """
        soup = self._get("Most-Viewed+Bills")
        table = soup.find("table", class_="confluenceTable")
        if table:
            to_ret = []
            rows = table.findAll("tr")
            h = HTMLParser()
            for row in rows:
                # BUG FIX: removed unused per-row `bills = {}` local; `bill`
                # below is the real accumulator.
                columns = row.findAll("td")
                if columns and len(columns) == 3:
                    bill = {}
                    # Congress number appears like "[116th]" in column 1.
                    bill['congress'] = re.search(
                        r"\[(\d+)\w+\]", columns[1].contents[1]).groups()[0]
                    bill['congress'] = int(bill['congress'])
                    bill['number'] = columns[1].find("a").contents[0].strip()
                    # Titles are HTML-escaped and may contain stray quotes.
                    bill['title'] = h.unescape(
                        re.sub(r"\"", "", columns[2].contents[0]))
                    to_ret.append(bill)
            return to_ret
예제 #11
0
 def convert_to_colour_list(cls, colours, *args, **kwargs):
     """
     Takes a whole munge of nonsense input, converts it into a list of colours.
     Will split apart comma delimited strings. Will decode HTML chars. Will
     concatenate a mixture of comma strings and items

     NOTE(review): Python 2 style — uses the `unicode` builtin.
     """
     colours = copy.deepcopy(colours)  # Ensure we don't bugger up original
     if isinstance(colours, (str, unicode)):
         colours = [colours]  # Listify
     colours.extend(args)
     intermediate_list = []
     # Add in comma delimited stuff
     h = HTMLParser()
     for colour_term in colours:
         if isinstance(colour_term, (str, unicode)):
             colour_term_decoded = h.unescape(
                 colour_term)  # HTML char decode
             intermediate_list.extend(colour_term_decoded.split(","))
         else:
             intermediate_list.append(colour_term)
     # Now sanitise the list again.
     # BUG FIX: the original computed colour_clean = colour.strip() but then
     # appended the *unstripped* value, discarding the cleanup.
     output_list = []
     for colour in intermediate_list:
         if isinstance(colour, (str, unicode)):
             colour = colour.strip()
         output_list.append(colour)
     return output_list
예제 #12
0
def strip_tags(string, allowed_tags=''):
    """Remove HTML tags from ``string`` and decode HTML entities.

    :param allowed_tags: comma-separated tags to keep, e.g. '<b>,<i>'.
        Opening (with or without attributes) and closing forms are kept.
    :return: the cleaned, entity-decoded string.
    """
    if allowed_tags != '':
        # Normalise "<b>, </i>" style input to bare tag names: ['b', 'i'].
        allowed_tags_list = re.sub(r'[\\/<> ]+', '', allowed_tags).split(',')
        allowed_patterns = []
        for tag_name in allowed_tags_list:
            if tag_name == '':
                continue
            # Allow opening (with or without attributes) and closing forms.
            allowed_patterns.append('</?' + tag_name + '( [^><]*)?>$')
        allowed_pattern = '|'.join(allowed_patterns)
        # BUG FIX: the tag-finding regex was r'<]+>' (matched nothing) and
        # the allowed pattern ended with a trailing '|' (an empty alternative
        # that matched every tag, so nothing was ever stripped).
        all_tags = re.findall(r'<[^>]+>', string, re.I)
        for tag in all_tags:
            # If not allowed, replace it.
            if not re.match(allowed_pattern, tag, re.I):
                string = string.replace(tag, '')
    else:
        # If no allowed tags, remove all.
        string = re.sub(r'<[^>]*?>', '', string)

    # Decode entities; HTMLParser.unescape() was removed in Python 3.9.
    try:
        from html import unescape  # Python 3.4+
    except ImportError:  # Python 2 fallback
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape
    string = unescape(string)

    return string
def parse_denied_courses(school_html):
    """Parse the NCAA 'denied courses' table out of a school page.

    :param school_html: raw HTML of the school page.
    :return: list of dicts, one per course row, keyed by column meaning.
    """
    root = fromstring(school_html)
    denied_table = root.cssselect('#NcaaCrs_DeniedCategory_All')
    # Hoisted: the original built a fresh HTMLParser for every course row.
    h = HTMLParser()
    courses = []
    for tr in denied_table[0].cssselect('tr')[1:]:
        tables = tr.cssselect('table')
        try:
            subject = tables[0].cssselect('.hs_tableHeader')[0].text_content()
        except IndexError:
            # Rows without a subject header table are skipped.
            continue

        for course_tr in tables[1].cssselect('tbody tr'):
            tds = course_tr.cssselect('td')
            course = {
                'subject': subject,
                'course_weight': tds[0].text_content().strip(),
                # Titles are HTML-escaped in the source markup.
                'title': h.unescape(tds[1].text_content().strip()),
                'notes': tds[2].text_content().strip(),
                'max_credits': tds[3].text_content().strip(),
                'ok_through': tds[4].text_content().strip(),
                'reason_code': tds[5].text_content().strip(),
                'disability_course': tds[6].text_content().strip(),
            }
            courses.append(course)

    return courses
예제 #14
0
    def get_programs(self, channel):
        """ Get a list of all programs of the specified channel.
        :type channel: str
        :rtype list[Program]
        NOTE: This function doesn't use an API.
        """
        if channel not in CHANNELS:
            raise Exception('Unknown channel %s' % channel)

        # Load webpage
        data = self._get_url(CHANNELS[channel]['url'])

        # Parse programs
        h = HTMLParser()
        # Matches each program-overview anchor, capturing url path and title.
        regex_programs = re.compile(
            r'<a class="program-overview__link" href="(?P<path>[^"]+)">\s+'
            r'<span class="program-overview__title">\s+(?P<title>[^<]+)</span>.*?'
            r'</a>', re.DOTALL)

        # Titles are HTML-escaped in the markup; decode before constructing.
        programs = [
            Program(channel=channel,
                    path=program.group('path').lstrip('/'),
                    title=h.unescape(program.group('title').strip()))
            for program in regex_programs.finditer(data)
        ]

        return programs
예제 #15
0
 def add_set(self, title, description, index=-1):
     """Add a checkbox widget for a set; returns the created QCheckBox.

     :param index: insert position in the layout; appended when negative.
     """
     # '&' doubles to '&&' so Qt does not treat it as a mnemonic marker.
     widget = QtWidgets.QCheckBox(title.replace('&', '&&'))
     if description:
         h = HTMLParser()
         # Descriptions arrive HTML-escaped; decode for the tooltip.
         widget.setToolTip(h.unescape(description))
     if index >= 0:
         self.sets_widget.layout().insertWidget(index, widget)
     else:
         self.sets_widget.layout().addWidget(widget)
     return widget
예제 #16
0
 def add_set(self, title, description, index=-1):
     """Add a checkbox widget for a set; returns the created QCheckBox.

     :param index: insert position in the layout; appended when negative.
     """
     # '&' doubles to '&&' so Qt does not treat it as a mnemonic marker.
     widget = QtWidgets.QCheckBox(title.replace('&', '&&'))
     if description:
         h = HTMLParser()
         # Descriptions arrive HTML-escaped; decode for the tooltip.
         widget.setToolTip(h.unescape(description))
     if index >= 0:
         self.sets_widget.layout().insertWidget(index, widget)
     else:
         self.sets_widget.layout().addWidget(widget)
     return widget
예제 #17
0
 def logged_in(self, y):
     """Return True when the session looks logged in, refreshing the show map.

     Logged in = (no page given, or the page shows a logout link) AND a
     'remember_web_*' cookie is present in the session.
     NOTE(review): Python 2 idioms — relies on filter() returning a list and
     on str.decode(); both behave differently on Python 3.
     """
     if all([None is y or 'logout' in y,
             bool(filter(lambda c: 'remember_web_' in c, self.session.cookies.keys()))]):
         if None is not y:
             # Build {show_id: show_name} from the page's <option> dropdown.
             self.shows = dict(re.findall('<option value="(\d+)">(.*?)</option>', y))
             h = HTMLParser()
             for k, v in self.shows.items():
                 self.shows[k] = sanitizeSceneName(h.unescape(unidecode(v.decode('utf-8'))))
         return True
     return False
예제 #18
0
File: run.py  Project: hect1c/rss-crawler
class Feed:
    # Class to handle Feeds
    def __init__(self, data, markup):
        # data: raw RSS/XML text; markup: BeautifulSoup parser name.
        self.obj = BeautifulSoup(data, markup)
        self.html_parser = HTMLParser()

    def getFeeds(self):
        """Return {'title', 'link', 'items'} for the whole feed."""
        # instantiate
        feeds = {}

        # get title
        feeds['title'] = self.getTitle()
        # get link
        feeds['link'] = self.getLink()
        # get items
        feeds['items'] = self.setupItems()

        return feeds

    def getTitle(self):
        # Feed-level <title> text.
        return self.obj.title.string

    def getLink(self):
        # First <link> element's text.
        return self.obj.find('link').string

    def getItems(self):
        # All <item> elements in the feed.
        return self.obj.find_all('item')

    def setupItems(self):
        """Normalise each <item> into a plain dict with unescaped text."""
        items = self.getItems()
        data = []

        for item in items:
            new_item = {
                'title': self.html_parser.unescape( item.title.string ),
                'link': item.find("link").string,
                # NOTE(review): this stores the <comments> Tag object itself,
                # not its text, unlike the other fields — confirm intent.
                'comments_link': item.find("comments"),
                'publication_date': item.find('pubDate').text,
                'author': self.html_parser.unescape( item.find('creator').text )
            }
            data.append(new_item)

        return data
예제 #19
0
def check_bz_bug(b):
    ''' Return status of a bug in BZ'''
    html = get_html(b)
    if html:
        text = html.content.decode('utf-8')
        # Search once; the original ran TITLE.search(text) twice (once for
        # the guard, once for the group).
        match = TITLE.search(text)
        name = match.group(1) if match else ''
        # Bug titles are HTML-escaped in the page source.
        h = HTMLParser()
        name = h.unescape(name)
    else:
        name = ''
    return name, None
예제 #20
0
File: tmp.py  Project: arxcruz/tempest-tool
def check_bz_bug(b):
    ''' Return status of a bug in BZ'''
    html = get_html(b)
    if html:
        text = html.content.decode('utf-8')
        # Search once; the original ran TITLE.search(text) twice (once for
        # the guard, once for the group).
        match = TITLE.search(text)
        name = match.group(1) if match else ''
        # Bug titles are HTML-escaped in the page source.
        h = HTMLParser()
        name = h.unescape(name)
    else:
        name = ''
    return name, None
예제 #21
0
def get_formatted_value(value, field):
	'''Prepare field from raw data'''

	from six.moves.html_parser import HTMLParser

	# Rich-text fields are unescaped and have <script>/<style> blocks removed
	# before whitespace is collapsed.
	if(getattr(field, 'fieldtype', None) in ["Text", "Text Editor"]):
		h = HTMLParser()
		value = h.unescape(value)
		# BUG FIX: the global inline flag '(?s)' must appear at the start of
		# the pattern — placing it at the end raises re.error on Python 3.11+.
		value = (re.subn(r'(?s)<[\s]*(script|style).*?</\1>', '', text_type(value))[0])
		value = ' '.join(value.split())
	return field.label + " : " + strip_html_tags(text_type(value))
예제 #22
0
def clean_abstract_text(html):
    """Convert an HTML abstract to plain text.

    Sub/superscripts are rewritten first, HTML entities decoded, then any
    remaining tags are stripped with BeautifulSoup.
    """
    # Super and Sub scripts insertion
    text = super_sub_script_replace(html, '<sub>', '</sub>')
    text = super_sub_script_replace(text, '<sup>', '</sup>')
    # Translate special characters
    h = HTMLParser()
    text = h.unescape(text)
    # Remove all tags remaining
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    return text
예제 #23
0
 def bug_status(self):
     ''' Return status of a bug in BZ'''
     html = utils.get_html(self.url)
     if html:
         text = html.content.decode('utf-8')
         # Search once; the original ran TITLE.search(text) twice (once for
         # the guard, once for the group).
         match = TITLE.search(text)
         name = match.group(1) if match else ''
         # Bug titles are HTML-escaped in the page source.
         h = HTMLParser()
         name = h.unescape(name)
     else:
         name = ''
     return name, None
예제 #24
0
    def get_committees(self, congress, chamber):
        """
        Get committees for given congress and chamber
        """
        endpoint = "{congress}/{chamber}/committees.json".format(
            congress=congress, chamber=chamber.lower())
        results = self._get(endpoint)
        if not results:
            return []

        committees = results[0]['committees']
        # Committee names come back HTML-escaped from the API.
        parser = HTMLParser()
        for committee in committees:
            committee['name'] = parser.unescape(committee['name'])
        return committees
예제 #25
0
 def get_email_subject(self):
     """
     WARNING: It is MANDATORY to override method if you are going to
     send email using the  `send_notification_email` method.
     Your class must define an `email_subject_tmpl` attribute
     containing a template path to a file that has your email subject.
     """
     # Convert the html back to plaintext after rendering it using template
     # to get rid of html ampersand character codes
     parser = HTMLParser()
     html_email = self._get_email_field('email_subject_tmpl', 'get_email_subject')
     return parser.unescape(html_email)
예제 #26
0
def get_formatted_value(value, field):
    '''Prepare field from raw data'''

    from six.moves.html_parser import HTMLParser

    # Rich-text fields are unescaped and have <script>/<style> blocks removed
    # before whitespace is collapsed.
    if (getattr(field, 'fieldtype', None) in ["Text", "Text Editor"]):
        h = HTMLParser()
        value = h.unescape(value)
        # BUG FIX: the global inline flag '(?s)' must appear at the start of
        # the pattern — placing it at the end raises re.error on Python 3.11+.
        value = (re.subn(r'(?s)<[\s]*(script|style).*?</\1>', '',
                         text_type(value))[0])
        value = ' '.join(value.split())
    return field.label + " : " + strip_html_tags(text_type(value))
예제 #27
0
    def try_unescape(self, value):
        """Best-effort HTML-entity decode; returns the input unchanged on failure."""
        # Fast path: no numeric entities present, nothing to decode.
        if '&#' not in value:
            return value

        try:
            if orig_unescape:
                # Prefer the saved original implementation when available.
                new_value = orig_unescape(value)
            else:
                new_value = HTMLParser.unescape(self, value)
        except Exception as e:  # NOTE(review): `e` is captured but unused
            return value

        return new_value
예제 #28
0
    def try_unescape(self, value):
        """Best-effort HTML-entity decode for URLs; returns input on failure."""
        # Only URL-like values are processed.
        if not value.startswith('http'):
            return value

        try:
            if orig_unescape:
                # Prefer the saved original implementation when available.
                new_value = orig_unescape(value)
            else:
                new_value = HTMLParser.unescape(self, value)
        except Exception as e:  # NOTE(review): `e` is captured but unused
            return value

        return new_value
예제 #29
0
    def try_unescape(self, value):
        """Best-effort HTML-entity decode for URLs; returns input on failure."""
        # Only URL-like values are processed.
        if not value.startswith('http'):
            return value

        try:
            if orig_unescape:
                # Prefer the saved original implementation when available.
                new_value = orig_unescape(value)
            else:
                new_value = HTMLParser.unescape(self, value)
        except Exception as e:  # NOTE(review): `e` is captured but unused
            return value

        return new_value
예제 #30
0
def get_AllPictureFromAlbum(album_id, cookie):
    '''
        Get all picture from album_id

        Returns a list of (album_id, photo_id, unescaped_title) tuples for
        every thumbnail on every page of the album.
    '''
    h = HTMLParser()

    def _extract_pics(soup):
        # One tuple per thumbnail block on the page.
        pics = []
        blocks = soup.findAll("div", {'class': re.compile(r'pbx_photo_thumb')})
        for block in blocks:
            parsed = urlparse.urlparse(block.h4.a['href'])
            photo_id = urlparse.parse_qs(parsed.query)['photo_id'][0]
            pics.append((album_id, photo_id, h.unescape(block.div.img['title'])))
        return pics

    base_url = 'https://www.photobox.fr/mon-photobox/album?album_id='
    r = request_url(base_url + album_id, cookie)
    soup = BeautifulSoup(r.content)
    nb_pages = len(
        soup.findAll("div", {'class': re.compile(r'^pbx_pagination$')}))
    # First page (the duplicated extraction loop is now a shared helper).
    list_pics = _extract_pics(soup)

    for page in range(1, nb_pages):
        base_url_pagination = "https://www.photobox.fr/includes/ajax/my/album/content?cat=album&action=paginate&album_id=" + album_id + "&page=" + str(
            page)
        # NOTE(review): album_id is appended a second time here, exactly as
        # in the original code — looks like a bug; confirm the endpoint.
        r = request_url(base_url_pagination + album_id, cookie)
        soup = BeautifulSoup(r.content)
        list_pics.extend(_extract_pics(soup))
    return list_pics
예제 #31
0
    def test_generate_username_unicode(self):
        """
        Ensure that unique usernames can be generated from unicode base names.

        Uses HTML escaping to accurately reproduce an issue experienced with unicode names in decoded HTML requests.
        """
        h = HTMLParser()
        # A Hebrew name encoded as HTML numeric character references.
        escaped_name = '&#1495;&#1497;&#1497;&#1501;_&#1506;&#1502;&#1512;&#1504;&#1497;'
        username = h.unescape(escaped_name)
        # Occupy the base name so the generator is forced to de-duplicate.
        user_exists = User.objects.create(username=username)

        generator = UsernameGenerator()
        new_username = generator.generate_username(username)
        # NOTE(review): `return` before an assertion is unnecessary in a test.
        return self.assertEqual(new_username, u'{}_1'.format(username))
예제 #32
0
        def update():
            """ Fetch the program metadata by scraping """
            # Load webpage
            page = self._get_url(CHANNELS[channel]['url'] + '/' + path)

            parser = HTMLParser()
            program_json = None
            episode_json = None

            # Extract video JSON by looking for a data-video tag
            # This is not present on every page
            regex_video_data = re.compile(r'data-video="([^"]+)"', re.DOTALL)
            result = regex_video_data.search(page)
            if result:
                # Video pages short-circuit: the full metadata comes from the
                # site API keyed by the embedded video id.
                video_id = json.loads(parser.unescape(result.group(1)))['id']
                video_json_data = self._get_url(
                    '%s/video/%s' % (self.SITE_APIS[channel], video_id))
                video_json = json.loads(video_json_data)
                return dict(video=video_json)

            # Extract program JSON
            regex_program = re.compile(r'data-hero="([^"]+)', re.DOTALL)
            result = regex_program.search(page)
            if result:
                program_json_data = parser.unescape(result.group(1))
                program_json = json.loads(program_json_data)['data']

            # Extract episode JSON
            regex_episode = re.compile(
                r'<script type="application/json" data-drupal-selector="drupal-settings-json">(.*?)</script>',
                re.DOTALL)
            result = regex_episode.search(page)
            if result:
                episode_json_data = parser.unescape(result.group(1))
                episode_json = json.loads(episode_json_data)

            # Either entry may still be None when its markup was absent.
            return dict(program=program_json, episode=episode_json)
예제 #33
0
def replace_html_entities(xml_bin_str):
    """XML does not contain entity references for many HTML entities, yet the
    Federal Register XML sometimes contains the HTML entities. Replace them
    here, lest we throw off XML parsing

    :param xml_bin_str: XML document as UTF-8 bytes.
    :return: the same bytes with HTML entities replaced by their characters.
    """
    parser = HTMLParser()
    match = HTML_RE.search(xml_bin_str)
    # Replace one entity per pass until none remain; str.replace swaps every
    # occurrence of the matched entity at once.
    while match:
        match_bin = match.group(0)
        match_str = match_bin.decode('utf-8')
        replacement = parser.unescape(match_str).encode('UTF-8')
        logger.debug("Replacing %s with %s in retrieved XML",
                     match_str, replacement)
        xml_bin_str = xml_bin_str.replace(match_bin, replacement)
        match = HTML_RE.search(xml_bin_str)
    return xml_bin_str
예제 #34
0
    def try_unescape(self, value):
        """Decode entities in a URL value, re-encoding changed text to UTF-8 bytes."""
        # Only URL-like values are processed.
        if not value.startswith('http'):
            return value

        try:
            new_value = HTMLParser.unescape(self, value)
        except:  # NOTE(review): bare except — hides unrelated errors
            return value

        if value != new_value:
            # ensure utf-8 encoded to avoid %-encoding query here
            if isinstance(new_value, text_type):
                new_value = new_value.encode('utf-8')

        return new_value
def replace_html_entities(xml_bin_str):
    """XML does not contain entity references for many HTML entities, yet the
    Federal Register XML sometimes contains the HTML entities. Replace them
    here, lest we throw off XML parsing

    :param xml_bin_str: XML document as UTF-8 bytes.
    :return: the same bytes with HTML entities replaced by their characters.
    """
    parser = HTMLParser()
    match = HTML_RE.search(xml_bin_str)
    # Replace one entity per pass until none remain; str.replace swaps every
    # occurrence of the matched entity at once.
    while match:
        match_bin = match.group(0)
        match_str = match_bin.decode('utf-8')
        replacement = parser.unescape(match_str).encode('UTF-8')
        logger.debug("Replacing %s with %s in retrieved XML", match_str,
                     replacement)
        xml_bin_str = xml_bin_str.replace(match_bin, replacement)
        match = HTML_RE.search(xml_bin_str)
    return xml_bin_str
예제 #36
0
 def logged_in(self, y):
     """Return True when the session looks logged in, refreshing the show map.

     Logged in = (no page given, or the page shows a logout link) AND a
     'remember_web_*' cookie is present in the session.
     NOTE(review): Python 2 idioms — relies on filter() returning a list and
     on str.decode(); both behave differently on Python 3.
     """
     if all([
             None is y or 'logout' in y,
             bool(
                 filter(lambda c: 'remember_web_' in c,
                        self.session.cookies.keys()))
     ]):
         if None is not y:
             # Build {show_id: show_name} from the page's <option> dropdown.
             self.shows = dict(
                 re.findall('<option value="(\d+)">(.*?)</option>', y))
             h = HTMLParser()
             for k, v in self.shows.items():
                 self.shows[k] = sanitizeSceneName(
                     h.unescape(unidecode(v.decode('utf-8'))))
         return True
     return False
예제 #37
0
def get_AllAlbumsFormPhotobox(cookie):
    '''
        Fetch every album as a list of (album_id, unescaped_name) tuples.
    '''
    h = HTMLParser()
    base_url = 'https://www.photobox.fr/mon-photobox/albums'

    r = request_url(base_url, cookie)
    soup = BeautifulSoup(r.content)
    anchors = soup.findAll("a", {'class': re.compile(r'^pbx_object_title$')})

    list_albums = []
    for anchor in anchors:
        # The album id lives in the anchor's href query string.
        query = urlparse.urlparse(anchor['href']).query
        album_id = urlparse.parse_qs(query)['album_id'][0]
        list_albums.append((album_id, h.unescape(anchor['title'])))
    return list_albums
예제 #38
0
class Client(object):
    """HTTP client that proxies requests through Facebook's debug echo tool."""

    def __init__(self, cookie):
        self.cookie = format_cookie(cookie)
        self.headers = {
            'Host': 'developers.facebook.com',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'deflate',
            'Connection': 'keep-alive',
            'Cookie': self.cookie,
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
            'TE': 'Trailers'
        }
        self.html_parser = HTMLParser()

    def unescape(self, html):
        # Decode HTML entities in an echoed response body.
        return self.html_parser.unescape(html)

    def get(self, url, **kwargs):
        """GET ``url`` through the echo endpoint; raises InvalidURL when the
        URL has no host."""
        if kwargs.get("params"):
            url += "?" + requests.compat.urlencode(kwargs.get("params"))
            kwargs.pop("params")

        parsed_url = requests.compat.urlparse(url)
        # BUG FIX: ParseResult is an immutable namedtuple — _replace() returns
        # a *new* object; the original code discarded the result, so the
        # scheme/path defaults were never applied.
        if not parsed_url.scheme:
            parsed_url = parsed_url._replace(**{"scheme": "http"})
        if not parsed_url.path:
            parsed_url = parsed_url._replace(**{"path": "/"})
        if not parsed_url.netloc:
            raise InvalidURL(parsed_url.geturl())

        escaped_url = requests.compat.quote_plus(parsed_url.geturl())
        response = requests.get(
            'https://developers.facebook.com/tools/debug/echo/?q=%s' %
            escaped_url,
            headers=self.headers,
            **kwargs)
        handle_response(response)
        return response
예제 #39
0
        def update():
            """ Fetch the program listing by scraping """
            # Load webpage
            raw_html = self._get_url(CHANNELS[channel]['url'])

            # Parse programs
            parser = HTMLParser()
            # Matches each program-overview anchor, capturing path and title.
            regex_programs = re.compile(
                r'<a class="program-overview__link" href="(?P<path>[^"]+)">\s+'
                r'<span class="program-overview__title">\s+(?P<title>[^<]+)</span>.*?'
                r'</a>', re.DOTALL)
            # Map url-path -> unescaped title for every program link found.
            data = {
                item.group('path').lstrip('/'):
                parser.unescape(item.group('title').strip())
                for item in regex_programs.finditer(raw_html)
            }

            if not data:
                raise Exception('No programs found for %s' % channel)

            return data
예제 #40
0
File: utils.py  Project: cvium/irc_bot
def decode_html(data):
    """Decode dumb html"""
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
    # use html.unescape() with a Python 2 fallback.
    try:
        from html import unescape  # Python 3.4+
    except ImportError:  # Python 2
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape
    return unescape(data)
예제 #41
0
def html_unescape(s):
    """Return ``s`` with HTML entities decoded."""
    # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
    # use html.unescape() with a Python 2 fallback.
    try:
        from html import unescape  # Python 3.4+
    except ImportError:  # Python 2
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape
    return unescape(s)
예제 #42
0
File: utils.py  Project: diox/olympia
def notify_about_activity_log(addon, version, note, perm_setting=None,
                              send_to_reviewers=True, send_to_staff=True):
    """Notify relevant users about an ActivityLog note."""
    comments = (note.details or {}).get('comments')
    if not comments:
        # Just use the name of the action if no comments provided.  Alas we
        # can't know the locale of recipient, and our templates are English
        # only so prevent language jumble by forcing into en-US.
        with translation.override(settings.LANGUAGE_CODE):
            comments = '%s' % amo.LOG_BY_ID[note.action].short
    else:
        htmlparser = HTMLParser()
        comments = htmlparser.unescape(comments)

    # Collect add-on authors (excl. the person who sent the email.) and build
    # the context for them.
    addon_authors = set(addon.authors.all()) - {note.user}

    author_context_dict = {
        'name': addon.name,
        'number': version.version,
        'author': note.author_name,
        'comments': comments,
        'url': absolutify(addon.get_dev_url('versions')),
        'SITE_URL': settings.SITE_URL,
        'email_reason': 'you are listed as an author of this add-on',
        'is_info_request': note.action == amo.LOG.REQUEST_INFORMATION.id,
    }

    # Not being localised because we don't know the recipients locale.
    with translation.override('en-US'):
        if note.action == amo.LOG.REQUEST_INFORMATION.id:
            if addon.pending_info_request:
                days_left = (
                    # We pad the time left with an extra hour so that the email
                    # does not end up saying "6 days left" because a few
                    # seconds or minutes passed between the datetime was saved
                    # and the email was sent.
                    addon.pending_info_request + timedelta(hours=1) -
                    datetime.now()
                ).days
                if days_left > 9:
                    author_context_dict['number_of_days_left'] = (
                        '%d days' % days_left)
                elif days_left > 1:
                    author_context_dict['number_of_days_left'] = (
                        '%s (%d) days' % (apnumber(days_left), days_left))
                else:
                    author_context_dict['number_of_days_left'] = 'one (1) day'
            subject = u'Mozilla Add-ons: Action Required for %s %s' % (
                addon.name, version.version)
            reviewer_subject = u'Mozilla Add-ons: %s %s' % (
                addon.name, version.version)
        else:
            subject = reviewer_subject = u'Mozilla Add-ons: %s %s' % (
                addon.name, version.version)
    # Build and send the mail for authors.
    template = template_from_user(note.user, version)
    from_email = formataddr((note.author_name, NOTIFICATIONS_FROM_EMAIL))
    send_activity_mail(
        subject, template.render(author_context_dict),
        version, addon_authors, from_email, note.id, perm_setting)

    if send_to_reviewers or send_to_staff:
        # If task_user doesn't exist that's no big issue (i.e. in tests)
        try:
            task_user = {get_task_user()}
        except UserProfile.DoesNotExist:
            task_user = set()

    # BUG FIX: `reviewers` and `reviewer_context_dict` were previously only
    # defined inside the `send_to_reviewers` branch, so calling with
    # send_to_reviewers=False and send_to_staff=True raised NameError below.
    reviewers = set()
    reviewer_context_dict = author_context_dict.copy()

    if send_to_reviewers:
        # Collect reviewers on the thread (excl. the email sender and task user
        # for automated messages), build the context for them and send them
        # their copy.
        log_users = {
            alog.user for alog in ActivityLog.objects.for_version(version) if
            acl.is_user_any_kind_of_reviewer(alog.user)}
        reviewers = log_users - addon_authors - task_user - {note.user}
        reviewer_context_dict['url'] = absolutify(
            reverse('reviewers.review',
                    kwargs={
                        'addon_id': version.addon.pk,
                        'channel': amo.CHANNEL_CHOICES_API[version.channel]
                    }, add_prefix=False))
        reviewer_context_dict['email_reason'] = 'you reviewed this add-on'
        send_activity_mail(
            reviewer_subject, template.render(reviewer_context_dict),
            version, reviewers, from_email, note.id, perm_setting)

    if send_to_staff:
        # Collect staff that want a copy of the email, build the context for
        # them and send them their copy.
        staff = set(
            UserProfile.objects.filter(groups__name=ACTIVITY_MAIL_GROUP))
        staff_cc = (
            staff - reviewers - addon_authors - task_user - {note.user})
        staff_cc_context_dict = reviewer_context_dict.copy()
        staff_cc_context_dict['email_reason'] = (
            'you are member of the activity email cc group')
        send_activity_mail(
            reviewer_subject, template.render(staff_cc_context_dict),
            version, staff_cc, from_email, note.id, perm_setting)
예제 #43
0
File: pages.py  Project: P4ncake/weboob
 def filter(self, data):
     """Run the parent Entities filter, then decode HTML entities in the result."""
     h = HTMLParser()
     txt = super(Entities, self).filter(data)
     return h.unescape(txt)