def core_course_get_files(self, username, password, course_id, section):
    """Download every downloadable resource of one course section into ./temp.

    Returns 0 on success, 0xff when re-authentication fails.
    """
    course_url = "http://moodle.fbtuit.uz/course/view.php?id=" + str(course_id)
    page = HTMLParser(self.session.get(course_url).text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return 0xff
        page = HTMLParser(self.session.get(course_url).text)
    # Themes and their files live inside <li id="section-N"> tags; narrow the
    # parser down to the requested section only.
    for tag in page.tags('li'):
        if tag.attributes.get('id') == 'section-' + str(section):
            page = HTMLParser(tag.html)
            break
    links = []
    for tag in page.tags('a'):
        # .get(): anchors without an href would otherwise raise KeyError.
        href = tag.attributes.get('href')
        if href is not None and href not in links:
            links.append(href)
    # exist_ok only tolerates "already exists"; other mkdir failures
    # (permissions, bad cwd) now surface instead of being swallowed.
    os.makedirs(os.getcwd() + "/temp", exist_ok=True)
    for link in links:
        if 'resource' not in link:
            continue
        resp = self.session.get(link, allow_redirects=True)
        # The final redirect target ends in the real file name (URL-encoded).
        file_name = unquote(resp.url[resp.url.rfind("/") + 1:])
        if 'view.php' in file_name:  # redirect landed on a page, not a file
            continue
        with open(os.getcwd() + "/temp/" + file_name, 'wb') as file:
            file.write(resp.content)
    return 0
def core_course_get_tasks(self, username, password, course_id):
    """Return course assignments as display strings.

    Each entry looks like "<theme>\n├<n>. <task name> <task_id>"; the last
    entry uses "└" instead of "├".  Returns a placeholder list when the
    course has no assignments, [] when re-authentication fails.
    """
    course_url = "http://moodle.fbtuit.uz/course/view.php?id=" + str(course_id)
    page = HTMLParser(self.session.get(course_url).text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return []
        page = HTMLParser(self.session.get(course_url).text)
    tasks = []
    counter = 1
    ids = []  # assignment ids already collected, to skip duplicate anchors
    # Assignments sit inside the <li id="section-..."> blocks.
    for tag in page.tags('li'):
        if tag.attributes.get('id', '')[:7] != 'section':
            continue
        if 'http://moodle.fbtuit.uz/mod/assign/view.php?' not in tag.html:
            continue
        section = HTMLParser(tag.html)
        theme = section.css_first('span').text() + '\n'
        for tag1 in section.tags('a'):
            href = tag1.attributes.get('href')
            if not href:
                continue
            if 'http://moodle.fbtuit.uz/mod/assign/view.php?' not in href:
                continue
            # The assignment id is whatever follows the last '=' in the URL.
            task_id = href[href.rfind("=") + 1:]
            if task_id in ids:
                continue
            tasks.append(theme + "├" + str(counter) + ". " + tag1.text() +
                         " " + task_id)
            ids.append(task_id)
            counter += 1
            theme = ''  # show the theme heading only before its first task
    # Guard BEFORE indexing: the original indexed tasks[-1] first and raised
    # IndexError on courses without assignments.
    if not tasks:
        return ["Bu yerda topshiriqlar yo'q :)"]
    tasks[-1] = tasks[-1].replace("├", "└")
    return tasks
def parseReferencesSelectolax(text):
    """Benchmark helper: collect all <a href> values with Selectolax and print timing."""
    t0 = time.time()
    tree = HTMLParser(text)
    found = []
    for anchor in tree.tags('a'):
        attrs = anchor.attrs
        if 'href' in attrs:
            found.append(attrs['href'])
    elapsed = time.time() - t0
    print(f"Python Selectolax\t\t\tDuration: {elapsed:.9f}\t Found: {len(found)}")
def core_course_get_contents(self, username, password, course_id):
    """List the course themes that contain downloadable files, numbered.

    Returns a placeholder list when nothing is downloadable, [] when
    re-authentication fails.
    """
    course_url = "http://moodle.fbtuit.uz/course/view.php?id=" + str(course_id)
    page = HTMLParser(self.session.get(course_url).text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return []
        page = HTMLParser(self.session.get(course_url).text)
    counter = 1
    contents = []
    # Every theme is an <li id="section-N">; section-0 is the course header
    # and is deliberately skipped.
    for tag in page.tags("li"):
        sec_id = tag.attributes.get('id') or ''
        if not sec_id.startswith("section") or sec_id == "section-0":
            continue
        # Only list sections that actually contain a downloadable resource.
        if 'resource' not in tag.html:
            continue
        section = HTMLParser(tag.html)
        contents.append(str(counter) + ". " + section.css_first("span").text())
        counter += 1
    if not contents:
        contents = ["Bu yerda yuklanadigan hech narsa yo'q :/"]
    return contents
def core_task_get_info(self, username, password, task_id):
    """Collect assignment details: course name, description, submission state.

    Returns {} when re-authentication fails; otherwise a dict with keys
    'task', 'course_name', 'info', 'task_files', 'submitted_files'.
    """
    page = HTMLParser(
        self.session.get(
            'http://moodle.fbtuit.uz/mod/assign/view.php?id=' +
            str(task_id) + "&lang=uz").text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first(
            'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return {}
        page = HTMLParser(
            self.session.get(
                'http://moodle.fbtuit.uz/mod/assign/view.php?id=' +
                str(task_id) + "&lang=uz").text)
    infos = {'task': ''}
    # The <div id="page"> header carries the course name in its <h1>.
    for div in page.css("div"):
        if div.attributes.get("id", '') == "page":
            infos['course_name'] = div.css_first("h1").text()
    links = []            # files the teacher attached to the task
    submitted_files = []  # files the student already submitted
    page.unwrap_tags(['p'])  # flatten <p> wrappers so text() reads cleanly
    for tag in page.tags("div"):
        if 'id' in tag.attributes:
            if tag.attributes['id'] == 'intro':
                # Task description, whitespace-normalized to a single line.
                infos['task'] = ' '.join(tag.text().split())
                for css in tag.css('a'):
                    links.append(css.attributes['href']
                                 )  # files attached to the task
    counter = 0
    info = ''
    # Walk the submission-status table cell by cell.  Cells 12 and 13 are
    # skipped -- presumably a row that is not useful here; TODO confirm
    # against the live page layout.
    for tag in page.tags('td'):
        if 11 < counter < 14:
            counter += 1
            continue
        info += tag.text()
        # Cells alternate label/value: ": " joins a label to its value,
        # "😁" separates one label/value pair from the next.
        if counter % 2:
            info += "😁"
        else:
            info += ": "
        counter += 1
        if tag.css_first('a'):
            for css in tag.css('a'):
                if "submission_files" in css.attributes['href']:
                    submitted_files.append(css.attributes['href'])
        # "Urinish bo'lmagan" (Uzbek, ~"no attempt yet") in the first row
        # means there is nothing more to read.
        if counter == 3:
            if "Urinish bo'lmagan" in info:
                break
    infos['info'] = ' '.join(info.split())
    infos['task_files'] = len(links) > 0
    infos['submitted_files'] = len(submitted_files) > 0
    return infos
def test_decompose():
    """Decomposing every <p> node must leave an empty <div> in the body."""
    markup = "<body><div><p id='p3'>text</p></div></body>"
    tree = HTMLParser(markup)
    for paragraph in tree.tags('p'):
        paragraph.decompose()
    assert tree.body.child.html == '<div></div>'
def extractLinksHtmlParser(txt):
    """Return the href attribute of every anchor found in *txt*."""
    tree = HTMLParser(txt)
    return [
        node.attributes['href']
        for node in tree.tags('a')
        if 'href' in node.attributes
    ]
def core_task_get_files(self, username, password, task_id, submission):
    """Download an assignment's files into ./tasks.

    Per the original author's note: submission=True fetches the files the
    student uploaded (anchors containing 'mod_assign'), otherwise the
    other party's files (anchors containing 'submission_files').
    Returns 0 on success, 0xff when re-authentication fails.
    """
    task_url = f"http://moodle.fbtuit.uz/mod/assign/view.php?id={task_id}"
    page = HTMLParser(self.session.get(task_url).text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return 0xff
        page = HTMLParser(self.session.get(task_url).text)
    # Both branches scan every anchor for a distinguishing URL fragment.
    needle = 'mod_assign' if submission else 'submission_files'
    links = []
    for tag in page.tags("a"):
        # .get(): anchors without an href would otherwise raise KeyError.
        href = tag.attributes.get('href')
        if href and needle in href:
            links.append(href)
    # exist_ok only tolerates "already exists"; other failures now surface.
    os.makedirs("tasks", exist_ok=True)
    for url in links:
        # File name sits between the last '/' and the query string.
        file_name = unquote(url[url.rfind("/") + 1:url.rfind("?")])
        # 'with' guarantees the handle is closed even if the write fails.
        with open(os.getcwd() + "/tasks/" + file_name, 'wb') as file:
            file.write(self.session.get(url).content)
    return 0
def get_page_elements(html):
    """Extract title, anchors, tag/keyword meta info and visible text from *html*.

    Returns None when the document has no <body>, otherwise a dict with keys
    'title', 'h_refs', 'meta', 'extracted_text'.
    """
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    # Remove non-text "text" content before extracting the body text.
    for node in tree.css('script'):
        node.decompose()
    for node in tree.css('style'):
        node.decompose()
    title_nodes = tree.tags('title')
    title = title_nodes[0].text() if title_nodes else ''
    h_refs = []
    for a_tag in tree.tags('a'):
        attrs = a_tag.attributes
        if 'href' in attrs:
            entry = {'href': attrs['href']}
            if 'title' in attrs:
                entry['title'] = attrs['title']
            h_refs.append(entry)
    meta = {}
    # Only <meta property=... content=...> pairs are interesting here.
    for meta_tag in tree.tags('meta'):
        attributes = meta_tag.attributes
        if 'property' not in attributes or 'content' not in attributes:
            continue
        prop = attributes['property']
        content = attributes['content']
        # isinstance also covers the original "is not None" check.
        if not isinstance(prop, str) or content is None:
            continue
        if 'tag' in prop:
            meta['tags'] = content.split(',')
        if 'keyword' in prop:
            meta['keywords'] = content.split(',')
    # NOTE: the original built an unused local list of tag names here; removed.
    extracted_text = remove_junk(tree.body.text(separator=' '))
    return {
        'title': title,
        'h_refs': h_refs,
        'meta': meta,
        'extracted_text': extracted_text,
    }
def extract(content):
    """Parse the <tr> rows of *content* and upsert EXPLOIT-DB/CVE pairs.

    Rows look like "EXPLOIT-DB:1 | CVE_NR_1"; each match is written to the
    module-level MongoDB *collection*, keyed by the exploit filename.
    """
    dom = HTMLParser(content)
    for row in dom.tags('tr'):
        cells = [cell.text() for cell in row.iter()]
        # Only rows shaped like "EXPLOIT-DB:<id> | <cve>" are of interest.
        if len(cells) == 2 and cells[0].startswith('EXPLOIT-DB'):
            obj = {"filename": cells[0].split(':')[1], "cve": cells[1].strip()}
            # Upsert so repeated runs refresh entries instead of duplicating.
            collection.update({"filename": cells[0].split(':')[1]}, obj, upsert=True)
def extractlink(s):
    """Return absolute links for every anchor in *s*, using module-level base *k*."""
    results = []
    for node in HTMLParser(s).tags("a"):
        if "href" not in node.attributes:
            continue
        href = node.attributes["href"]
        if str(href).startswith("/"):
            # Site-relative path -> prefix the base URL.
            results.append(k + href)
        elif href.startswith("http"):
            # Already absolute.
            results.append(href)
        elif href:
            # Bare relative path -> join with a slash.
            results.append(k + "/" + href)
    return results
def extract(content):
    """Collect unique link targets: <a>/<link>/<meta> hrefs plus sitemap <loc> entries.

    Fragment links (containing '#') and stylesheet <link> tags are skipped.
    """
    dom = HTMLParser(content)
    found = []
    for node in dom.tags('a'):
        href = node.attributes.get('href')
        if href is not None and '#' not in href:
            found.append(href)
    for node in dom.tags('link'):
        attrs = node.attributes
        href = attrs.get('href')
        # Stylesheets are not crawlable pages: require a non-stylesheet rel.
        if href is not None and 'rel' in attrs and 'stylesheet' not in attrs['rel']:
            if '#' not in href:
                found.append(href)
    for node in dom.tags('meta'):
        href = node.attributes.get('href')
        if href is not None and '#' not in href:
            found.append(href)
    # Sitemap documents list URLs in <loc> elements.
    for node in dom.tags('loc'):
        found.append(node.text())
    return list(set(found))
def parse(self):
    """Scrape the alphabetical index pages and collect recipe links."""
    # Recipe URLs end in a seven-digit id under /recipes/.
    pattern = r'.*foodnetwork\.com/recipes/.*\d{7}'
    # Recipes are indexed alphabetically; '123' and 'xyz' are extra buckets.
    suffixes = list(string.ascii_lowercase) + ['123', 'xyz']
    for suffix in suffixes:
        response = requests.get(self.base_link + suffix)
        tree = HTMLParser(response.text)
        for anchor in tree.tags('a'):
            href = anchor.attributes.get('href', '')
            if re.fullmatch(pattern, href):
                self.links.add('http:' + href)
def index_html(url):
    """Fetch *url* and build an index entry: links, body text, title, timestamp."""
    r = session.get(url)
    tree = HTMLParser(r.text)
    # Remove non-text "text" content.
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    # Skip anchors without an href value: urljoin(url, None) raises TypeError.
    hrefs = [a.attrs['href'] for a in tree.tags("a")
             if a.attrs.get('href') is not None]
    # Absolutize each link and strip fragments and query strings.
    links = [
        requests.compat.urljoin(url, h).split("#")[0].split('?')[0]
        for h in hrefs
    ]
    text = tree.body.text(separator='\n')
    title_node = tree.css_first("title")
    title = title_node.text() if title_node else None
    title = title or url  # fall back to the URL for untitled/empty-title pages
    return dict(url=url,
                links=links,
                body=text,
                last_indexed=datetime.datetime.utcnow(),
                title=title)
def test_malformed_attributes():
    """Meta tags with mojibake attribute values must still yield truthy nodes."""
    markup = '<div> <meta name="description" content="ÐаÑ"Ð " /></div>'
    parser = HTMLParser(markup)
    for node in parser.tags('meta'):
        assert node
def extract_infos(headers, content):
    """Fingerprint a web page: CMS name/version, server headers, plugins, themes.

    headers: response-header mapping (keys matched case-insensitively).
    content: raw HTML body as a string.
    Returns a dict with at least 'cms' and 'version' keys; may also carry
    'server', any 'x-*' headers, 'Plugins' and 'Themes' (via append_info).
    """
    data = dict()
    headers = {k.lower(): v for k, v in headers.items()}
    # WordPress leaks its version in the emoji/embed script URLs.
    wp_version = re.findall(
        r'wp-(?:emoji-release|embed)\.min\.js.*ver=(.*?)[\"\']', content)
    if wp_version:
        wp_version = wp_version[0]
    cms = 'Default'
    version = 'version'
    dom = HTMLParser(content)
    # <meta name="generator" content="SomeCMS x.y"> style detection.
    for tag in dom.tags('meta'):
        attrs = tag.attributes
        name = attrs.get('name')
        # Guards: a valueless name attr is None (.lower() would crash), and
        # a generator meta without content would raise KeyError.
        if name and 'generator' == name.lower():
            cms = attrs.get('content') or cms
            version = re.findall(r'\d+\.*\d*\.*\d*', cms)
            if version:
                version = version[0]
                cms = re.sub(re.escape(version), '', cms).strip()
    if cms == 'Default':
        # Fall back to headers, then to crude content sniffing.
        lowered = content.lower()  # hoisted: was recomputed per elif branch
        if 'x-powered-by' in headers.keys():
            cms = headers.get('x-powered-by')
            if 'x-aspnet-version' in headers.keys():
                version = headers.get('x-aspnet-version')
        elif 'magento' in lowered:
            cms = 'Magento'
        elif 'shopify' in lowered:
            cms = 'Shopify'
        elif 'squarespace' in lowered:
            cms = 'Squarespace'
        elif 'blogger.com' in lowered:
            cms = 'Blogger'
        elif 'typo3' in lowered:
            cms = 'TYPO3'
        elif 'opencart' in lowered:
            cms = 'OpenCart'
        elif 'joomla' in lowered:
            cms = 'Joomla'
        elif 'prestashop' in lowered:
            cms = 'Prestashop'
        elif 'wordpress' in lowered:
            cms = 'Wordpress'
        elif 'drupal' in lowered:
            cms = 'Drupal'
    data['cms'] = cms
    # Prefer the script-derived WordPress version when available.
    data['version'] = wp_version if wp_version else version
    # Record server identity and any custom x- headers not already captured.
    for key in headers.keys():
        if key in ('server', 'x-server'):
            data['server'] = headers.get(key)
        if key.startswith('x-') and headers.get(key) not in data.values():
            data[key] = headers.get(key)
    # WordPress plugins/themes, then Drupal modules/themes, from asset URLs.
    plugins = re.findall(r'wp-content/plugins/(.*?)/.*ver=(.*?)[\s\'\"]',
                         content)
    if plugins:
        data = append_info(plugins, data, 'Plugins')
    wp_themes = re.findall(r'/wp-content/themes/(.*)/.*?ver=(.*?)[\s\'\"]',
                           content)
    if wp_themes:
        data = append_info(wp_themes, data, 'Themes')
    drupal_modules = re.findall(r'/modules/.*/(.*?)\.css\?v=(.*?)[\s\"\']',
                                content)
    if drupal_modules:
        data = append_info(drupal_modules, data, 'Plugins')
    drupal_themes = re.findall(r'/themes/.*?/(.*)/css.*?v=(.*?)[\s\'\"]',
                               content)
    if drupal_themes:
        data = append_info(drupal_themes, data, 'Themes')
    return data