def __init__(self, strict=False, reps=None, outs=None, sc=True):
    """Initialize the parser.

    Args:
        strict: Passed through to ``HTMLParser.__init__`` (legacy flag).
        reps: Optional report/replacement helper; when given, it receives
            a back-reference to this parser via its ``parser`` attribute.
        outs: Output stream the parser writes to.
        sc: Whether HTML comments should be stripped.
    """
    self.rep = reps
    self.outStream = outs
    self.stripComment = sc
    # Bug fix: the original dereferenced self.rep unconditionally, so the
    # documented default (reps=None) crashed with AttributeError.
    if self.rep is not None:
        self.rep.parser = self
    HTMLParser.__init__(self, strict)
def __init__(self):
    """Set up per-document parsing state."""
    HTMLParser.__init__(self)
    # Collected anchor hrefs and current parsing flags.
    self.links = []
    self.in_anchor = False
    self.attrs = None
    self.title = ''
def __init__(self, baseURL):
    """Link-collecting parser rooted at *baseURL*."""
    HTMLParser.__init__(self)
    # Open-tag stack plus the anchors/links gathered so far.
    self.stack = []
    self.anchors = set()
    self.links = []
    self.baseURL = baseURL
    self.printed = False
def __init__(self): """An overload of the HTML Parser constructor. We use this initialization code to make sure that every variable is flushed. Arguments: self -- Allows the function to reference parent class properties. It is unnecessary to specify self during function calls as it is implied. """ # Initialize the HTML Parser. HTMLParser.__init__(self) # Initialize the variables. self._record_name = False self._record_meal = False self._record_station = False self._record_attributes = False self._day = EMPTY_STRING self._meal = EMPTY_STRING self._station = EMPTY_STRING self._name_text = [] self._station_text = [] self._attributes = [] # Hold all the dining hall menus. self.menu = []
def __init__(self, builder=None, encoding=None):
    """Create a tree-building parser.

    builder  -- target tree builder; a fresh ElementTree.TreeBuilder()
                is created when omitted.
    encoding -- source encoding, defaulting to "iso-8859-1".
    """
    self.__stack = []
    self.__builder = builder if builder is not None else ElementTree.TreeBuilder()
    self.encoding = encoding or "iso-8859-1"
    HTMLParser.__init__(self)
def __init__(self):
    """Deprecated link collector; emits a DeprecationWarning on creation."""
    warnings.warn(
        "portage.getbinpkg.ParseLinks is deprecated",
        DeprecationWarning,
        stacklevel=2,
    )
    # Anchor hrefs gathered while parsing.
    self.PL_anchors = []
    html_parser_HTMLParser.__init__(self)
def __init__(self):
    """Parser that collects data from one designated element."""
    HTMLParser.__init__(self)
    # Toggled on once the target tag/attribute pair is seen.
    self.collect_data = False
    self.bound = 20
    # Target element: <div id="content">.
    self.des_tag = "div"
    self.des_attr = ("id", "content")
    self.stations_info = None
def __init__(self):
    """Initialize the state machine and output fields."""
    HTMLParser.__init__(self)
    self.state = State.NOWHERE  # current position in the document
    self.data = ""              # text accumulated for the current field
    self.pnpid = None
    self.company = None
    self.table = []             # rows collected so far
def __init__(self):
    """Form-scraping parser state."""
    HTMLParser.__init__(self)
    self.url = None           # target URL (presumably the form action)
    self.params = {}          # collected form parameters
    self.in_form = False      # currently inside a <form>
    self.form_parsed = False  # a complete form has been handled
    self.method = "GET"       # submission method, until overridden
def __init__(self):
    """Reset all bookkeeping before a new document is fed."""
    HTMLParser.__init__(self)
    self.state = []    # parsing-state stack
    self.href = ""     # most recently seen link target
    self.obj = {}
    self.index = {}
    self.done = False  # set once parsing is finished
def __init__(self):
    """Article extractor: records author/body text from whitelisted tags."""
    HTMLParser.__init__(self)
    self.data = {}
    self.recordingAuthor = False
    self.recordingBody = False
    self.data["body"] = ""
    # Only text inside these tags contributes to the body.
    self.save_tags = ['p', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5']
def gen_slogan(msg):
    """Fetch a slogan for *msg* from sloganizer.net.

    Retries (recursively) until a non-empty slogan comes back.
    Raises AttributeError if the response contains no ``>...<`` span.
    """
    import html  # stdlib replacement for the removed HTMLParser.unescape
    resp = get('http://www.sloganizer.net/en/outbound.php', params={'slogan': msg})
    slogan = re.search('>(.*)<', resp.text).group(1)
    # The service double-escapes entities, hence the double unescape.
    # HTMLParser.unescape was deprecated in 3.4 and removed in Python 3.9.
    slogan = html.unescape(html.unescape(slogan))
    slogan = slogan.replace('\\', '').strip()
    return slogan if slogan else gen_slogan(msg)
def __init__(self, strict=False):
    """Initialize the advisory-page parser and all extraction state.

    There are 3 types of variables:
    * data_variables: these are the required information;
    * data_check_variables: boolean values corresponding to each of the
      data_variables, to keep a check on whether the data has already
      been extracted or not;
    * tag_check_variables: used for matching the proper format.
    """
    # Constructor call of parent class.
    HTMLParser.__init__(self, strict)
    # --- description ---
    self.h1 = False
    self.desc = False
    self.description = ' '
    # --- solution ---
    self.sol = False
    self.solution = ' '
    self.p = False
    self.li = False
    # --- references ---
    self.ref = False
    self.references = ' '
    # --- CVSS score / CVE id ---
    self.cvss = False
    self.cvss_score = 0.0
    self.cve = False
    self.cve_id = ' '
    self.links = []
    # --- affected products ---
    self.prod = False
    self.products = []
    self.last_h6 = ' '
    self.h6 = False
    self.h7 = False
    # --- attack classification and impact ---
    self.clas = False
    self.attack_from = ' '
    self.attk = False
    self.impact = ' '
    self.impt = False
def __init__(self, args):
    """Initialize the crawler from parsed command-line *args*.

    Sets up crawl configuration, the MongoDB collection used for
    storage, URL bookkeeping lists, and per-post parsing flags.
    ``args`` is assumed to carry URL/depth/time/db/coll/sub/verbose
    attributes (argparse namespace) — confirm against the caller.
    """
    HTMLParser.__init__(self)
    self.root_url = args.URL    # Original URL passed.
    self.netloc = urllib.parse.urlparse(self.root_url).netloc  # Netloc of the URL.
    self.depth = args.depth     # Distance (pages) to travel.
    self.timer = args.time      # Amount of time per page.
    self.db = MongoClient()[args.db][args.coll]  # Database that stores data.
    self.sub = args.sub         # Subdirectory to set as root of webpage.
    self.verbose = args.verbose  # Verbosity setting.
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    self.key_terms = ["buy", "sell", "trade", "trading"]
    self.count = 0                  # Amount of pages processed.
    self.posts = 0                  # Amount of posts scanned.
    self.urlBlacklist = []          # Already completed URLS.
    self.urlDNU = []                # Do not use URLS, duplicates.
    self.urlList = [self.root_url]  # List of URLS to scan.
    self.items = []                 # Items to look for.
    self.discovered = {}            # Items discovered + [URLs]
    self.BigDict = {}               # Dictionary containing ThreadID + [URLS] <- urlDNU list?.
    # # # # # # # # # # # # # # # # # # # # #
    self.li_main = False           # Start of play contribution
    self.blockquote_main = False   # Start of the message
    self.div_quote_main = False    # Start of Quote Container
    self.div_quote_xpand = False   # Start of QuoteExpand
    self.blockquote_quote = False  # Start of Quote Message
    self.text_lock = True          # Locks the ability to print text or use it.
    self.li_name = None            # Name of original author
    self.blockquote_name = None    # Name of person being quoted.
    # # # # # # # # # # # # # # # # # # #
    self.queryDB()  # Loads the self.items list.
def handle_task(self, job):
    """Render and send one mail job.

    *job* is a dict with optional keys: user, group, sender, recipients,
    subject, body, attachments, smtp_host, smtp_port, html, jobctx.
    Subject and body are rendered through Template with the jobctx data;
    for plain-text mail, HTML entities are decoded first.
    Returns whatever ``self.sendmail`` returns.
    """
    user = job.get('user', 'root')
    group = job.get('group', 'root')
    mail = job.get('sender', None)
    account = Account(user=user, group=group, mail=mail)
    recipients = job.get('recipients', None)
    subject = ensure_unicode(job.get('subject', ''))
    body = ensure_unicode(job.get('body', ''))
    attachments = job.get('attachments', None)
    smtp_host = job.get('smtp_host', 'localhost')
    smtp_port = job.get('smtp_port', 25)
    html = job.get('html', False)
    template_data = job.get('jobctx', {})
    body = Template(body)(template_data)
    subject = Template(subject)(template_data)
    if not html:
        # html.unescape replaces HTMLParser().unescape, which was
        # deprecated in Python 3.4 and removed in 3.9.  Imported here
        # because the local variable `html` shadows the module name.
        from html import unescape
        body = unescape(body)
        subject = unescape(subject)
    # Execute the task
    return self.sendmail(account, recipients, subject, body, attachments,
                         smtp_host, smtp_port, html)
def forwards(apps, schema_editor):
    """Data migration: turn CarouselSlidePlugin captions into child
    TextPlugin instances, rewriting embedded ``<img id="plugin_obj_N">``
    markers into ``<cms-plugin>`` tags.
    """
    from html import unescape  # HTMLParser.unescape was removed in Python 3.9
    for cascade_element in CascadeElement.objects.all():
        if cascade_element.plugin_type != 'CarouselSlidePlugin':
            continue
        caption = cascade_element.glossary.get('caption')
        if not caption:
            continue
        text_element = add_plugin(cascade_element.placeholder, TextPlugin,
                                  cascade_element.language, target=cascade_element)
        old_body = unescape(caption)
        new_body, count = _replace_text_body(
            old_body,
            input_pattern=r'<img ([^>]*)\bid="plugin_obj_(?P<pk>\d+)"([^>]*)/?>',
            output_tag='<cms-plugin {}></cms-plugin>',
            id_format='id="{}"',
        )
        text_element.body = new_body
        text_element.save()
        # TODO: need to be re-tested (branch deliberately disabled via `False and`)
        if False and count > 0:
            for link_element in CMSPlugin.objects.filter(
                    parent_id__in=(cascade_element.id, cascade_element.parent_id),
                    plugin_type='TextLinkPlugin'):
                # print("Move Link {} from {} -> {}".format(link_element.id, link_element.parent_id, text_element.id))
                link_element.move(text_element, pos='last-child')
                link_element.save()
def get_images(current_title, title, titles_length):
    """Return the image links found on the wiki page *title*.

    current_title and titles_length are only used for progress output.
    Returns [] for empty, redirecting, or malformatted pages.
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    print("Fetching images from %s... (%s/%s)" % (title, current_title + 1, titles_length))
    # Escape the title so we can create a valid link
    # title = title.replace('\'', '%27').replace(' ', '%20')
    # Retry until the server answers.
    while True:
        try:
            page = urlopen(SOURCE_LOCATION % title).read().decode(ENCODING)
            break
        except IOError:
            print("\tServer's being lazy, retrying...")
    if not page:
        print("\tFailed to get %s's images!" % title)
        return []
    # Ignore redirects
    if search("#DOORVERWIJZING", page, I | M) is not None or search("#REDIRECT.*", page, I | M) is not None:
        print("\tSkipping redirecting page %s" % title)
        return []
    imagelinks = []
    parser = ImageLocater(imagelinks)
    page = unescape(page)
    try:
        parser.feed(page)
    except Exception:
        # Was a bare `except:`; Exception keeps the best-effort behavior
        # without swallowing KeyboardInterrupt/SystemExit.
        print("%s is a malformatted page" % title)
        return []
    return imagelinks
def publishPost(self, post, link, comment):
    """Publish *post* to the configured Telegram channel.

    Builds an HTML message linking to *link*; text longer than the
    Telegram message limit is split into two messages.
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    logging.info(" Publishing in Telegram...")
    bot = self.tc
    title = post
    content = comment
    links = ""
    channel = self.channel
    title = unescape(title)
    text = '<a href="'+link+'">'+title+ "</a>\n" + content + '\n\n' + links
    textToPublish2 = ""
    if len(text) < 4090:
        textToPublish = text
        links = ""
    else:
        # Too long: drop the trailing links and split the body.
        text = '<a href="'+link+'">'+title + "</a>\n" + content
        textToPublish = text[:4080] + ' ...'
        textToPublish2 = '... '+ text[4081:]
    logging.info("text to "+ textToPublish)
    logging.info("text to 2"+ textToPublish2)
    bot.sendMessage('@'+channel, textToPublish, parse_mode='HTML')
    if textToPublish2:
        try:
            bot.sendMessage('@'+channel, textToPublish2[:4090], parse_mode='HTML')
        except Exception:
            # Was a bare `except:`; keep the fallback message behavior.
            bot.sendMessage('@'+channel, "Text is longer", parse_mode='HTML')
    if links:
        bot.sendMessage('@'+channel, links, parse_mode='HTML')
def feed(self, data):
    """Purify *data* and return the resulting HTML (overrides feed)."""
    self.reset_purified()
    HTMLParser.feed(self, data)
    return self.html()
def __init__(self):
    """Converter state; character references are handled manually."""
    HTMLParser.__init__(self)
    # Keep entity/char references un-decoded so handlers see them raw.
    self.convert_charrefs = False
    self.last = "starttag"  # kind of the previously handled token
    self.in_pre = False     # inside a <pre> block
    self.output = ""        # accumulated output text
    self.last_tag = ""
def __init__(self, base_href):
    """Group-listing scraper rooted at *base_href*."""
    HTMLParser.__init__(self)
    self.base_href = base_href
    self.results = {}
    # Current group fields, filled in while inside their markup.
    self.group_name = None
    self.group_desc = None
    # Depth counters for the sections currently being traversed.
    self.in_group_name = 0
    self.in_group_desc = 0
    self.in_activity = 0
    self._clear_info()
def __init__(self, url):
    """Listing parser for the page at *url*."""
    HTMLParser.__init__(self)
    self.url = url
    # Per-row parsing state.
    self.current_item = None
    self.save_data = None
    self.seeds_leech = False
    # Pre-compiled pattern for stripping commas (e.g. size strings).
    self.size_repl = re_compile(",")
def __init__(self, zip_file):
    """Filtering parser that writes processed HTML to a memory buffer."""
    HTMLParser.__init__(self)
    self._html = StringIO()  # buffer for the processed HTML
    self._zip_file = zip_file
    # Nesting depth used to exclude the contents of script and object tags.
    self._excl_nested_level = 0
def __init__(self):
    """Title extractor.

    The title is kept as a list so literal bytes and escaped Unicode
    chunks can be stored separately and joined later.
    """
    if py3:
        super().__init__()
    else:
        HTMLParser.__init__(self)
    self.title = []
def parse_html_data(rootParser, htmlData):
    """Parse *htmlData* with a fresh HTMLParser and return the root.

    rootParser is called as rootParser(parser, None, None, None) to build
    the root object; each line of htmlData is stripped and fed in turn.
    """
    parser = HTMLParser()
    root = rootParser(parser, None, None, None)
    for raw_line in htmlData.split('\n'):
        parser.feed(raw_line.strip())
    return root
def get_game_list(system):
    """List all the games on Guardiana for a given system.

    Returns a list of dicts with 'url' (session id stripped) and 'title'
    (a list of entity-decoded titles).
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    response = urllib.request.urlopen(
        "http://www.guardiana.net/MDG-Database/Complete-List/" + system + "/")
    doc = response.read()
    soup = BeautifulSoup(doc)
    html_game_list = soup.find("div", {"id": "MDGD_FullList_Box"})
    game_list = re.findall(
        """» <a href="(.+?)">(.+?)</a><br/>(?:<em>)?(.*?)(?:</em>)?<br/>""",
        str(html_game_list))
    game_dict_list = []
    for game in game_list:
        game_dict = {'url': "http://www.guardiana.net" + game[0], 'title': []}
        # Clean up the URL (drop any PHPSESSID query) and add it.
        result = re.search("(.*?)\?PHPSESSID=.*?", game[0])
        if result:
            game_dict['url'] = "http://www.guardiana.net" + result.group(1)
        else:
            game_dict['url'] = "http://www.guardiana.net" + game[0]
        # Unescape the HTML entities from titles and add them.
        game_dict['title'].append(unescape(game[1]))
        game_dict_list.append(game_dict)
    return game_dict_list
def linksh(self, cli, ev):
    """Announce the <title> of any URL posted in a tracked channel.

    YouTube links are delegated to self.ytlinks.  Returns 0 on success,
    1 when the event is ignored (untracked channel, no URL, no title).
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    try:
        self.chancache[ev.target.lower()]
    except Exception:
        # Was a bare `except:`; Exception preserves the "unknown channel
        # -> ignore" behavior without trapping KeyboardInterrupt.
        return 1
    if self.yt is True:
        yr = re.compile(".*(youtube\.com\/watch\?.*v=|youtu\.be\/)([A-Za-z"
                        "0-9._%-]*)[&\w;=\+_\-]*.*")
        res = yr.search(ev.arguments[0])
        if res is not None:
            self.ytlinks(cli, ev, res)
            return 0
    url = re.compile("((https?):((\/\/)|(\\\\))+[\w\d:#@%\/;$()~_?\+-=\\\."
                     "&]*)")
    res = url.search(ev.arguments[0])
    if res is None:
        return 1
    uri = res.group(1)
    r = urllib.request.urlopen(uri).read().decode('utf-8', 'replace')
    r = unescape(r)
    yr = re.compile(".*<title[^>]*>([^<]+)</title>.*")
    title = yr.search(r)
    if title is None:
        return 1
    cli.msg(ev.target, title.group(1))
def __init__(self):
    """Photo-list crawler state plus the tag/class filters of interest."""
    self.urlList = []
    self.index = 0
    self.nextUrl = ''
    # Only elements matching these tag/class combinations are considered.
    self.tagList = ['li', 'a']
    self.classList = ['photo-list-padding', 'pic']
    HTMLParser.__init__(self)
def _parse_article(self, div):
    """Extract one scholar search-result entry from *div*.

    Populates self.article (title, url, year, links, summary) and, when
    a title was found, passes it to self.handle_article.
    """
    from html import unescape  # replaces the removed HTMLParser.unescape
    self.article = Article()
    for tag in div:
        if not hasattr(tag, 'name'):
            continue
        if tag.name == 'div' and self._tag_has_class(tag, 'gs_ri'):
            rt = tag.find('h3', {'class': 'gs_rt'})
            if rt:
                # Drop the citation marker span before reading the title.
                ctu = rt.find('span')
                if ctu:
                    ctu.extract()
                self.article['title'] = unescape(''.join(rt.findAll(text=True)).strip())
                if rt.a:
                    self.article['url'] = self._path2url(rt.a['href'])
            if tag.find('div', {'class': 'gs_a'}):
                year = self.year_re.findall(tag.find('div', {'class': 'gs_a'}).text)
                self.article['year'] = year[0] if len(year) > 0 else None
            if tag.find('div', {'class': 'gs_fl'}):
                self._parse_links(tag.find('div', {'class': 'gs_fl'}))
            if tag.find('div', {'class': 'gs_rs'}):
                self.article['summary'] = tag.find('div', {'class': 'gs_rs'}).text
    if self.article['title']:
        self.handle_article(self.article)
def __init__(self, results, url, trackers):
    """Result-row parser.

    results  -- shared container for parsed items
    url      -- base URL of the queried site
    trackers -- tracker data kept for building result entries
    """
    HTMLParser.__init__(self)
    self.results = results
    self.url = url
    self.trackers = trackers
    # Per-row parsing state.
    self.td_counter = None
    self.current_item = None
def __init__(self, html=None):
    """Create the parser and, when *html* is given, parse it immediately.

    html -- optional document to feed right away.
    """
    HTMLParser.__init__(self)
    self.results = []
    # PEP 8: compare against None with `is not`, not `!=`.
    if html is not None:
        self.feed(html)
def __init__(self):
    """Pretty-printer state, seeded with the standard HTML header."""
    HTMLParser.__init__(self)
    self._level = 0        # current nesting depth
    self._last = ''        # last item handled
    self._in_code = False  # inside a <code> element
    self._prettified = [BASE_HTML_HEADER]
def __init__(self):
    """Initialize the parser with an empty result list."""
    HTMLParser.__init__(self)
    self.result = []
def __init__(self):
    """Text extractor: buffers data chunks, optionally suppressing output."""
    HTMLParser.__init__(self)
    self._buf = []
    self.hide_output = False
def __init__(self, logger=None, callback=None):
    """Parser with an optional logger and an optional callback hook."""
    HTMLParser.__init__(self)
    self.__logger = logger
    self.__stack = []
    self.__data = []
    self.__callback = callback
def __init__(self):
    """Tree-builder state: root element, open-element stack, text targets."""
    HTMLParser.__init__(self)
    self.root = None     # set once the first element is created
    self._stack = []     # currently open elements
    self._texting = None
    self._tailing = None
def __init__(self, **kw):
    """Forward all keyword options to HTMLParser; collect fed fragments."""
    HTMLParser.__init__(self, **kw)
    self._fed = []
def __init__(self):
    """Element collector; a_flag gates whether elements are recorded."""
    HTMLParser.__init__(self)
    self.a_flag = True
    self.elements = []
def __init__(self, allows=None):
    """HTML sanitizer setup.

    allows -- optional list of tag names to allow; when None or empty,
              the class-level default allow_tags is kept.

    The original signature used a mutable default (`allows=[]`), a classic
    Python pitfall; None is behaviorally identical here because the
    argument is only ever truth-tested.
    """
    HTMLParser.__init__(self)
    self.allow_tags = allows if allows else self.allow_tags
    self.result = []
    self.start = []
    self.data = []
def __init__(self, results, url, *args):
    """Row parser; extra positional arguments are accepted and ignored."""
    HTMLParser.__init__(self)
    self.url = url
    self.results = results
    # Per-row parsing state.
    self.td_counter = None
    self.current_item = None
def __init__(self, base):
    """URL resolver anchored at *base*; the result starts unresolved."""
    HTMLParser.__init__(self)
    self._base = base
    self._resolved_url = None
def reset(self):
    """Reset parser state and replace any pending entries with an
    exhausted iterator."""
    HTMLParser.reset(self)
    self.entry = iter(())
def __init__(self):
    """Result-section flag plus the rolls collected so far."""
    HTMLParser.__init__(self)
    self.inResult = False
    self.rolls = []
def __init__(self):
    """Dockerfile extractor: buffers text while the relevant tag is open."""
    self.buffering = False
    self.buf = ''
    self.dockerfile = ''
    HTMLParser.__init__(self)
def process_html(content):
    """Return *content* with HTML character entities decoded.

    Uses html.unescape directly: HTMLParser.unescape was deprecated in
    Python 3.4 and removed in 3.9, so instantiating a parser just to
    unescape no longer works on modern interpreters.
    """
    import html
    return html.unescape(content)
def __init__(self):
    """Track the tag currently open and the collected title text."""
    HTMLParser.__init__(self)
    self._cur_tag = ""
    self.title = ""
def __init__(self):
    """Game-list parser: interest flag, current record, and results."""
    HTMLParser.__init__(self)
    self._wanted = False
    self._current_game = None
    self._games = []
def __init__(self, htype):
    """Parser configured for element type *htype*."""
    HTMLParser.__init__(self)
    self.result = []
    self.this_llama = []
    self.scan = False  # toggled while inside a section of interest
    self.htype = htype
def __init__(self):
    """Login-status parser state."""
    HTMLParser.__init__(self)
    self.lname = ''
    self.notlogin = False
    self.result = []
def __init__(self):
    """Remember only the first image encountered (None until seen)."""
    HTMLParser.__init__(self)
    self.first_image = None
def __init__(self, *args, **kwargs):
    """Pass all options through; collect unique elements in a set."""
    HTMLParser.__init__(self, *args, **kwargs)
    self.elements = set()