def core_course_get_files(self, username, password, course_id, section):
    """Download every downloadable resource of one course section into ./temp.

    Returns 0 on success, 0xff when re-authentication fails.
    """
    course_url = "http://moodle.fbtuit.uz/course/view.php?id=" + str(course_id)
    page = HTMLParser(self.session.get(course_url).text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return 0xff
        page = HTMLParser(self.session.get(course_url).text)
    # Themes and their files live inside <li id="section-N"> tags; narrow the
    # parser down to the requested section only.
    for tag in page.tags('li'):
        if tag.attributes.get('id') == 'section-' + str(section):
            page = HTMLParser(tag.html)
            break
    links = []
    for tag in page.tags('a'):
        # .get(): anchors without an href would otherwise raise KeyError.
        href = tag.attributes.get('href')
        if href is not None and href not in links:
            links.append(href)
    # exist_ok only tolerates "already exists"; other mkdir failures
    # (permissions, bad cwd) now surface instead of being swallowed.
    os.makedirs(os.getcwd() + "/temp", exist_ok=True)
    for link in links:
        if 'resource' not in link:
            continue
        resp = self.session.get(link, allow_redirects=True)
        # The final redirect target ends in the real file name (URL-encoded).
        file_name = unquote(resp.url[resp.url.rfind("/") + 1:])
        if 'view.php' in file_name:  # redirect landed on a page, not a file
            continue
        with open(os.getcwd() + "/temp/" + file_name, 'wb') as file:
            file.write(resp.content)
    return 0
def core_course_get_tasks(self, username, password, course_id):
    """Return course assignments as display strings.

    Each entry looks like "<theme>\n├<n>. <task name> <task_id>"; the last
    entry uses "└" instead of "├".  Returns a placeholder list when the
    course has no assignments, [] when re-authentication fails.
    """
    course_url = "http://moodle.fbtuit.uz/course/view.php?id=" + str(course_id)
    page = HTMLParser(self.session.get(course_url).text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return []
        page = HTMLParser(self.session.get(course_url).text)
    tasks = []
    counter = 1
    ids = []  # assignment ids already collected, to skip duplicate anchors
    # Assignments sit inside the <li id="section-..."> blocks.
    for tag in page.tags('li'):
        if tag.attributes.get('id', '')[:7] != 'section':
            continue
        if 'http://moodle.fbtuit.uz/mod/assign/view.php?' not in tag.html:
            continue
        section = HTMLParser(tag.html)
        theme = section.css_first('span').text() + '\n'
        for tag1 in section.tags('a'):
            href = tag1.attributes.get('href')
            if not href:
                continue
            if 'http://moodle.fbtuit.uz/mod/assign/view.php?' not in href:
                continue
            # The assignment id is whatever follows the last '=' in the URL.
            task_id = href[href.rfind("=") + 1:]
            if task_id in ids:
                continue
            tasks.append(theme + "├" + str(counter) + ". " + tag1.text() +
                         " " + task_id)
            ids.append(task_id)
            counter += 1
            theme = ''  # show the theme heading only before its first task
    # Guard BEFORE indexing: the original indexed tasks[-1] first and raised
    # IndexError on courses without assignments.
    if not tasks:
        return ["Bu yerda topshiriqlar yo'q :)"]
    tasks[-1] = tasks[-1].replace("├", "└")
    return tasks
def parseReferencesSelectolax(text):
    """Benchmark helper: collect all <a href> values with Selectolax and print timing."""
    t0 = time.time()
    tree = HTMLParser(text)
    found = []
    for anchor in tree.tags('a'):
        attrs = anchor.attrs
        if 'href' in attrs:
            found.append(attrs['href'])
    elapsed = time.time() - t0
    print(f"Python Selectolax\t\t\tDuration: {elapsed:.9f}\t Found: {len(found)}")
def core_course_get_contents(self, username, password, course_id):
    """List the course themes that contain downloadable files, numbered.

    Returns a placeholder list when nothing is downloadable, [] when
    re-authentication fails.
    """
    course_url = "http://moodle.fbtuit.uz/course/view.php?id=" + str(course_id)
    page = HTMLParser(self.session.get(course_url).text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return []
        page = HTMLParser(self.session.get(course_url).text)
    counter = 1
    contents = []
    # Every theme is an <li id="section-N">; section-0 is the course header
    # and is deliberately skipped.
    for tag in page.tags("li"):
        sec_id = tag.attributes.get('id') or ''
        if not sec_id.startswith("section") or sec_id == "section-0":
            continue
        # Only list sections that actually contain a downloadable resource.
        if 'resource' not in tag.html:
            continue
        section = HTMLParser(tag.html)
        contents.append(str(counter) + ". " + section.css_first("span").text())
        counter += 1
    if not contents:
        contents = ["Bu yerda yuklanadigan hech narsa yo'q :/"]
    return contents
def core_task_get_info(self, username, password, task_id):
    """Collect assignment details: course name, description, submission state.

    Returns {} when re-authentication fails; otherwise a dict with keys
    'task', 'course_name', 'info', 'task_files', 'submitted_files'.
    """
    page = HTMLParser(
        self.session.get(
            'http://moodle.fbtuit.uz/mod/assign/view.php?id=' +
            str(task_id) + "&lang=uz").text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first(
            'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return {}
        page = HTMLParser(
            self.session.get(
                'http://moodle.fbtuit.uz/mod/assign/view.php?id=' +
                str(task_id) + "&lang=uz").text)
    infos = {'task': ''}
    # The <div id="page"> header carries the course name in its <h1>.
    for div in page.css("div"):
        if div.attributes.get("id", '') == "page":
            infos['course_name'] = div.css_first("h1").text()
    links = []            # files the teacher attached to the task
    submitted_files = []  # files the student already submitted
    page.unwrap_tags(['p'])  # flatten <p> wrappers so text() reads cleanly
    for tag in page.tags("div"):
        if 'id' in tag.attributes:
            if tag.attributes['id'] == 'intro':
                # Task description, whitespace-normalized to a single line.
                infos['task'] = ' '.join(tag.text().split())
                for css in tag.css('a'):
                    links.append(css.attributes['href']
                                 )  # files attached to the task
    counter = 0
    info = ''
    # Walk the submission-status table cell by cell.  Cells 12 and 13 are
    # skipped -- presumably a row that is not useful here; TODO confirm
    # against the live page layout.
    for tag in page.tags('td'):
        if 11 < counter < 14:
            counter += 1
            continue
        info += tag.text()
        # Cells alternate label/value: ": " joins a label to its value,
        # "😁" separates one label/value pair from the next.
        if counter % 2:
            info += "😁"
        else:
            info += ": "
        counter += 1
        if tag.css_first('a'):
            for css in tag.css('a'):
                if "submission_files" in css.attributes['href']:
                    submitted_files.append(css.attributes['href'])
        # "Urinish bo'lmagan" (Uzbek, ~"no attempt yet") in the first row
        # means there is nothing more to read.
        if counter == 3:
            if "Urinish bo'lmagan" in info:
                break
    infos['info'] = ' '.join(info.split())
    infos['task_files'] = len(links) > 0
    infos['submitted_files'] = len(submitted_files) > 0
    return infos
def test_decompose():
    """Decomposing every <p> node must leave an empty <div> in the body."""
    markup = "<body><div><p id='p3'>text</p></div></body>"
    tree = HTMLParser(markup)
    for paragraph in tree.tags('p'):
        paragraph.decompose()
    assert tree.body.child.html == '<div></div>'
def extractLinksHtmlParser(txt):
    """Return the href attribute of every anchor found in *txt*."""
    tree = HTMLParser(txt)
    return [
        node.attributes['href']
        for node in tree.tags('a')
        if 'href' in node.attributes
    ]
def core_task_get_files(self, username, password, task_id, submission):
    """Download an assignment's files into ./tasks.

    Per the original author's note: submission=True fetches the files the
    student uploaded (anchors containing 'mod_assign'), otherwise the
    other party's files (anchors containing 'submission_files').
    Returns 0 on success, 0xff when re-authentication fails.
    """
    task_url = f"http://moodle.fbtuit.uz/mod/assign/view.php?id={task_id}"
    page = HTMLParser(self.session.get(task_url).text)
    # The login-page title means the session expired -> re-authenticate once.
    if page.css_first('title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return 0xff
        page = HTMLParser(self.session.get(task_url).text)
    # Both branches scan every anchor for a distinguishing URL fragment.
    needle = 'mod_assign' if submission else 'submission_files'
    links = []
    for tag in page.tags("a"):
        # .get(): anchors without an href would otherwise raise KeyError.
        href = tag.attributes.get('href')
        if href and needle in href:
            links.append(href)
    # exist_ok only tolerates "already exists"; other failures now surface.
    os.makedirs("tasks", exist_ok=True)
    for url in links:
        # File name sits between the last '/' and the query string.
        file_name = unquote(url[url.rfind("/") + 1:url.rfind("?")])
        # 'with' guarantees the handle is closed even if the write fails.
        with open(os.getcwd() + "/tasks/" + file_name, 'wb') as file:
            file.write(self.session.get(url).content)
    return 0
def get_page_elements(html):
    """Extract title, anchors, tag/keyword meta info and visible text from *html*.

    Returns None when the document has no <body>, otherwise a dict with keys
    'title', 'h_refs', 'meta', 'extracted_text'.
    """
    tree = HTMLParser(html)
    if tree.body is None:
        return None
    # Remove non-text "text" content before extracting the body text.
    for node in tree.css('script'):
        node.decompose()
    for node in tree.css('style'):
        node.decompose()
    title_nodes = tree.tags('title')
    title = title_nodes[0].text() if title_nodes else ''
    h_refs = []
    for a_tag in tree.tags('a'):
        attrs = a_tag.attributes
        if 'href' in attrs:
            entry = {'href': attrs['href']}
            if 'title' in attrs:
                entry['title'] = attrs['title']
            h_refs.append(entry)
    meta = {}
    # Only <meta property=... content=...> pairs are interesting here.
    for meta_tag in tree.tags('meta'):
        attributes = meta_tag.attributes
        if 'property' not in attributes or 'content' not in attributes:
            continue
        prop = attributes['property']
        content = attributes['content']
        # isinstance also covers the original "is not None" check.
        if not isinstance(prop, str) or content is None:
            continue
        if 'tag' in prop:
            meta['tags'] = content.split(',')
        if 'keyword' in prop:
            meta['keywords'] = content.split(',')
    # NOTE: the original built an unused local list of tag names here; removed.
    extracted_text = remove_junk(tree.body.text(separator=' '))
    return {
        'title': title,
        'h_refs': h_refs,
        'meta': meta,
        'extracted_text': extracted_text,
    }
def extract(content):
    """Parse the <tr> rows of *content* and upsert EXPLOIT-DB/CVE pairs.

    Rows look like "EXPLOIT-DB:1 | CVE_NR_1"; each match is written to the
    module-level MongoDB *collection*, keyed by the exploit filename.
    """
    dom = HTMLParser(content)
    for row in dom.tags('tr'):
        cells = [cell.text() for cell in row.iter()]
        # Only rows shaped like "EXPLOIT-DB:<id> | <cve>" are of interest.
        if len(cells) == 2 and cells[0].startswith('EXPLOIT-DB'):
            obj = {"filename": cells[0].split(':')[1], "cve": cells[1].strip()}
            # Upsert so repeated runs refresh entries instead of duplicating.
            collection.update({"filename": cells[0].split(':')[1]}, obj, upsert=True)
def extractlink(s):
    """Return absolute links for every anchor in *s*, using module-level base *k*."""
    results = []
    for node in HTMLParser(s).tags("a"):
        if "href" not in node.attributes:
            continue
        href = node.attributes["href"]
        if str(href).startswith("/"):
            # Site-relative path -> prefix the base URL.
            results.append(k + href)
        elif href.startswith("http"):
            # Already absolute.
            results.append(href)
        elif href:
            # Bare relative path -> join with a slash.
            results.append(k + "/" + href)
    return results
def extract(content):
    """Collect unique link targets: <a>/<link>/<meta> hrefs plus sitemap <loc> entries.

    Fragment links (containing '#') and stylesheet <link> tags are skipped.
    """
    dom = HTMLParser(content)
    found = []
    for node in dom.tags('a'):
        href = node.attributes.get('href')
        if href is not None and '#' not in href:
            found.append(href)
    for node in dom.tags('link'):
        attrs = node.attributes
        href = attrs.get('href')
        # Stylesheets are not crawlable pages: require a non-stylesheet rel.
        if href is not None and 'rel' in attrs and 'stylesheet' not in attrs['rel']:
            if '#' not in href:
                found.append(href)
    for node in dom.tags('meta'):
        href = node.attributes.get('href')
        if href is not None and '#' not in href:
            found.append(href)
    # Sitemap documents list URLs in <loc> elements.
    for node in dom.tags('loc'):
        found.append(node.text())
    return list(set(found))
def parse(self):
    """Scrape the alphabetical index pages and collect recipe links."""
    # Recipe URLs end in a seven-digit id under /recipes/.
    pattern = r'.*foodnetwork\.com/recipes/.*\d{7}'
    # Recipes are indexed alphabetically; '123' and 'xyz' are extra buckets.
    suffixes = list(string.ascii_lowercase) + ['123', 'xyz']
    for suffix in suffixes:
        response = requests.get(self.base_link + suffix)
        tree = HTMLParser(response.text)
        for anchor in tree.tags('a'):
            href = anchor.attributes.get('href', '')
            if re.fullmatch(pattern, href):
                self.links.add('http:' + href)
def index_html(url):
    """Fetch *url* and build an index entry: links, body text, title, timestamp."""
    r = session.get(url)
    tree = HTMLParser(r.text)
    # Remove non-text "text" content.
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()
    # Skip anchors without an href value: urljoin(url, None) raises TypeError.
    hrefs = [a.attrs['href'] for a in tree.tags("a")
             if a.attrs.get('href') is not None]
    # Absolutize each link and strip fragments and query strings.
    links = [
        requests.compat.urljoin(url, h).split("#")[0].split('?')[0]
        for h in hrefs
    ]
    text = tree.body.text(separator='\n')
    title_node = tree.css_first("title")
    title = title_node.text() if title_node else None
    title = title or url  # fall back to the URL for untitled/empty-title pages
    return dict(url=url,
                links=links,
                body=text,
                last_indexed=datetime.datetime.utcnow(),
                title=title)
def test_malformed_attributes():
    """Meta tags with mojibake attribute values must still yield truthy nodes."""
    markup = '<div> <meta name="description" content="ÐаÑ"Ð " /></div>'
    parser = HTMLParser(markup)
    for node in parser.tags('meta'):
        assert node
def extract_infos(headers, content):
    """Fingerprint a web page: CMS name/version, server headers, plugins, themes.

    headers: response-header mapping (keys matched case-insensitively).
    content: raw HTML body as a string.
    Returns a dict with at least 'cms' and 'version' keys; may also carry
    'server', any 'x-*' headers, 'Plugins' and 'Themes' (via append_info).
    """
    data = dict()
    headers = {k.lower(): v for k, v in headers.items()}
    # WordPress leaks its version in the emoji/embed script URLs.
    wp_version = re.findall(
        r'wp-(?:emoji-release|embed)\.min\.js.*ver=(.*?)[\"\']', content)
    if wp_version:
        wp_version = wp_version[0]
    cms = 'Default'
    version = 'version'
    dom = HTMLParser(content)
    # <meta name="generator" content="SomeCMS x.y"> style detection.
    for tag in dom.tags('meta'):
        attrs = tag.attributes
        name = attrs.get('name')
        # Guards: a valueless name attr is None (.lower() would crash), and
        # a generator meta without content would raise KeyError.
        if name and 'generator' == name.lower():
            cms = attrs.get('content') or cms
            version = re.findall(r'\d+\.*\d*\.*\d*', cms)
            if version:
                version = version[0]
                cms = re.sub(re.escape(version), '', cms).strip()
    if cms == 'Default':
        # Fall back to headers, then to crude content sniffing.
        lowered = content.lower()  # hoisted: was recomputed per elif branch
        if 'x-powered-by' in headers.keys():
            cms = headers.get('x-powered-by')
            if 'x-aspnet-version' in headers.keys():
                version = headers.get('x-aspnet-version')
        elif 'magento' in lowered:
            cms = 'Magento'
        elif 'shopify' in lowered:
            cms = 'Shopify'
        elif 'squarespace' in lowered:
            cms = 'Squarespace'
        elif 'blogger.com' in lowered:
            cms = 'Blogger'
        elif 'typo3' in lowered:
            cms = 'TYPO3'
        elif 'opencart' in lowered:
            cms = 'OpenCart'
        elif 'joomla' in lowered:
            cms = 'Joomla'
        elif 'prestashop' in lowered:
            cms = 'Prestashop'
        elif 'wordpress' in lowered:
            cms = 'Wordpress'
        elif 'drupal' in lowered:
            cms = 'Drupal'
    data['cms'] = cms
    # Prefer the script-derived WordPress version when available.
    data['version'] = wp_version if wp_version else version
    # Record server identity and any custom x- headers not already captured.
    for key in headers.keys():
        if key in ('server', 'x-server'):
            data['server'] = headers.get(key)
        if key.startswith('x-') and headers.get(key) not in data.values():
            data[key] = headers.get(key)
    # WordPress plugins/themes, then Drupal modules/themes, from asset URLs.
    plugins = re.findall(r'wp-content/plugins/(.*?)/.*ver=(.*?)[\s\'\"]',
                         content)
    if plugins:
        data = append_info(plugins, data, 'Plugins')
    wp_themes = re.findall(r'/wp-content/themes/(.*)/.*?ver=(.*?)[\s\'\"]',
                           content)
    if wp_themes:
        data = append_info(wp_themes, data, 'Themes')
    drupal_modules = re.findall(r'/modules/.*/(.*?)\.css\?v=(.*?)[\s\"\']',
                                content)
    if drupal_modules:
        data = append_info(drupal_modules, data, 'Plugins')
    drupal_themes = re.findall(r'/themes/.*?/(.*)/css.*?v=(.*?)[\s\'\"]',
                               content)
    if drupal_themes:
        data = append_info(drupal_themes, data, 'Themes')
    return data