def check_html_tags_and_attributes(self):
    """This function checks the indentation of lines in HTML files.

    Returns:
        TaskResult. A TaskResult object representing the result of the lint
        check.
    """
    html_files_to_lint = self.html_filepaths
    failed = False
    error_messages = []
    name = 'HTML tag and attribute'
    for filepath in html_files_to_lint:
        file_content = self.file_cache.read(filepath)
        file_lines = self.file_cache.readlines(filepath)
        parser = CustomHTMLParser(filepath, file_lines, self.debug)
        parser.feed(file_content)

        if len(parser.tag_stack) != 0:
            raise TagMismatchException('Error in file %s\n' % filepath)

        if parser.failed:
            error_messages.extend(parser.error_messages)
            failed = True
    return concurrent_task_utils.TaskResult(
        name, failed, error_messages, error_messages)

def fetch(url):
    if url.startswith("ipfs:/"):
        url = "https://ipfs.io/%s" % url[6:]
    elif url.startswith("fs:/"):
        url = "https://ipfs.io/%s" % url[4:]

    if url == "-":
        return sys.stdin.read()

    if url.startswith("http:") or url.startswith("https:"):
        response = urllib.request.urlopen(url)
        data = response.read().decode(errors='replace')
        content_type = response.headers['Content-Type']

        if content_type and content_type.startswith('text/html'):
            parser = Parser()
            parser.feed(data)
            url = parser.url

            if not url:
                raise LoadError(
                    """<link rel="alternate" type="application/asciicast+json" href="..."> not found in fetched HTML document"""
                )

            return fetch(url)

        return data

    with open(url, 'r') as f:
        return f.read()

def download_problem(contest_uri, problem):
    problem_uri = contest_uri + '/problem/' + problem
    print('Retrieving', problem_uri, '...')
    sys.stdout.flush()
    problem_html = urllib.request.urlopen(problem_uri).read().decode('utf-8')
    print('Retrieved problem {} ({} bytes).'.format(problem, len(problem_html)))

    # Hack for codeforces HTML errors
    problem_html = problem_html.replace('<p</p>', '<p></p>')
    problem_html = problem_html.replace('<ul</ul>', '<ul></ul>')
    problem_html = problem_html.replace('<div class="sample-test"<',
                                        '<div class="sample-test"><')

    parser = ProblemHTMLParser()
    try:
        parser.feed(problem_html)
    except:
        print(problem_html, file=sys.stderr)
        raise
    examples = parser.getExamples()

    problem_dir = problem.lower()
    if not os.path.isdir(problem_dir):
        os.mkdir(problem_dir)
    for i, example in enumerate(examples, 1):
        input_path = os.path.join(problem_dir, 'in{}'.format(i))
        with open(input_path, 'w') as f:
            f.write(example[0])
        output_path = os.path.join(problem_dir, 'out{}'.format(i))
        with open(output_path, 'w') as f:
            f.write(example[1])
    print('Wrote {} examples for problem {}.'.format(len(examples), problem))

async def main(request):
    body = await request.read()

    secret = os.environ['GH_SECRET']
    oauth_token = os.environ['GH_AUTH']

    event = sansio.Event.from_http(request.headers, body, secret=secret)
    async with aiohttp.ClientSession() as session:
        gh = gh_aiohttp.GitHubAPI(session, "asv-bot", oauth_token=oauth_token)
        await router.dispatch(event, gh)

        response = await session.get("https://pandas.pydata.org/speed")
        text = await response.text()
        parser = ProjectParser()
        parser.feed(text)
        projects = parser.projects

        # today = datetime.date.today()
        today = datetime.date(2019, 4, 5)

        futures = [
            handle_regressions(project, gh, since=today)
            for project in projects
        ]
        await asyncio.gather(*futures)

    return web.Response(status=200)

def run(self):
    self.parent.visited += [self.url]
    if self.gleb > 0:
        parser = self.HParser(self.url)
        contents = ""
        try:
            response = urlopen(self.url)
            contents = response.read().decode('utf-8')
        except HTTPError:
            pass
        parser.feed(contents)
        parser.close()
        for v in parser.vals:
            # crawl the subpages
            if not v in self.parent.visited:
                thr = Probe.Thr(v, self.fun, self.gleb - 1, self.parent)
                thr.start()
                self.parent.threads += [thr]
                self.children += [thr]
    self.fun(self.url)  # run the action on the current page
    # wait for the child threads
    for t in self.children:
        t.join()

def web_bruter(self):
    while not self.password_q.empty() and not self.found:
        brute = self.password_q.get().rstrip()
        jar = http.cookiejar.FileCookieJar("cookies")
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(jar))

        response = opener.open(self.target_get_form_url)
        # decode so the HTML parser receives text, not bytes
        page = response.read().decode('utf-8', errors='replace')

        print(
            f"[*] Trying {self.username} : {brute} ({self.password_q.qsize()} Left)"
        )

        # parse out the hidden fields
        parser = self.parser_class()
        parser.feed(page)
        post_tags = parser.tag_results

        # add our username and password fields
        post_tags[self.username_field] = self.username
        post_tags[self.password_field] = brute

        # urlopen expects bytes for POST data
        login_data = urllib.parse.urlencode(post_tags).encode('utf-8')
        login_response = opener.open(self.target_post_form_url, login_data)
        login_result = login_response.read()

        if self.success_checker(login_result):
            self.found = True
            print("[*] Bruteforce Successful!")
            print(f"[*] Username : {self.username}")
            print(f"[*] Password : {brute}")
            print("[*] Waiting for other threads to exit!")

def fetch_article_contents(url):
    log_message('fetch_article_contents: `{}`', url)
    contents = urllib.request.urlopen(url)
    parser = ArticleParser()
    parser.feed(contents.read().decode('utf-8'))
    text = ''.join(['<p>{}</p>'.format(s) for s in parser.paragraphs])
    return (contents.geturl(), text)

def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    output_dir = pathlib.Path(argv[0])
    input_dir = pathlib.Path(argv[1])

    json_filepaths = input_dir.glob("*.json")
    for input_file in json_filepaths:
        parser = ImmunizeNVParser()
        slug = input_file.name.split(".", maxsplit=1)[0]
        output_file = output_dir / f"{slug}.parsed.ndjson"

        with open(input_file, "r") as in_fh:
            content = in_fh.read()

        html_data = extract_locator_data(content)
        parser.feed(html_data)

        with open(output_file, "w") as out_fh:
            parsed = parser.result
            for k in sorted(parsed.keys()):
                parsed[k]["id"] = generate_id(parsed[k]["title"])
                line = json.dumps(parsed[k])
                out_fh.write(line)
                out_fh.write("\n")

def identify_html_refs(path, get_ref):
    parser = LinkProcessor(get_ref, containing_component=get_component(path))
    with path.open('r') as f:
        parser.feed(f.read())

def clone_from_url(url, branch=None):
    page = urllib.request.urlopen(url)
    parser = HTMLParser()
    parser.feed(page.read().decode("utf-8"))
    title = parser.title
    if not title:
        raise RuntimeError("No title found for %s" % url)
    match = re.match(r"Issue\s+(\d+)\:\s+(.*?) - Python tracker", title)
    if match:
        number, name = match.groups()
    else:
        raise RuntimeError("No suitable title found for %s" % url)

    clone_name = clone_from_name("issue%s-%s" % (number, name), branch)

    #
    # Create a shortcut inside the new clone pointing to
    # the issue page in bugs.python.org
    #
    shortcut = pythoncom.CoCreateInstance(
        shell.CLSID_InternetShortcut,
        None,
        pythoncom.CLSCTX_INPROC_SERVER,
        shell.IID_IUniformResourceLocator
    )
    shortcut.SetURL(url)
    persist_file = shortcut.QueryInterface(pythoncom.IID_IPersistFile)
    path = os.path.abspath(os.path.join(clone_name, "issue%s.url" % number))
    persist_file.Save(path, 0)

    return clone_name

def set_asciidoc_attributes(self, release_download_url):
    response = requests.get(release_download_url)
    if not response.ok:
        raise IOError(
            "Failed to retrieve download information from: {}".format(
                release_download_url))

    parser = DownloadsHTMLParser()
    parser.feed(response.text)

    for version in parser.version_list:
        version_key = version.key

        # Add the latest version as "latest" as well as its version number.
        if parser.latest == version.release_number:
            self._asciidoc_attributes["latest"] = {
                "name": version.name,
                "date": version.date
            }
        self._asciidoc_attributes[version_key] = {
            "name": version.name,
            "date": version.date
        }

    self._asciidoc_attributes[
        'url_downloads_cassandra'] = release_download_url

def github_contrib(user, year):
    """ Get GitHub user daily contribution """

    # Check for a cached version (file)
    filename = "github-{0}-{1}.html".format(user, year)
    if os.path.exists(filename):
        with open(filename) as file:
            contents = file.read()
    # Else get file from GitHub
    else:
        url = "https://github.com/users/{0}/contributions?to={1}-12-31"
        url = url.format(user, year)
        contents = str(urllib.request.urlopen(url).read())
        with open(filename, "w") as file:
            file.write(contents)

    # Parse result (html)
    n = 1 + (date(year, 12, 31) - date(year, 1, 1)).days
    C = -np.ones(n, dtype=int)

    class HTMLParser(html.parser.HTMLParser):
        def handle_starttag(self, tag, attrs):
            if tag == "rect":
                data = {key: value for (key, value) in attrs}
                date = dateutil.parser.parse(data["data-date"])
                count = int(data["data-count"])
                day = date.timetuple().tm_yday - 1
                if count > 0:
                    C[day] = count

    parser = HTMLParser()
    parser.feed(contents)
    return C

def scrape(html_content):
    """
    This function could have been a "private" utility, but I left it "public"
    (note the quotation marks, as in Python there is no such thing as public
    or private access identifiers) because this scraping functionality can
    certainly be invoked by passing raw HTML content directly, effectively
    circumventing the actual HTTP request. This has been useful, for instance,
    for unit testing this behavior.

    :param html_content: a string containing the actual HTML content.
        Cannot be None or empty.
    :return: a dictionary with two keys: "total" and "top5", containing the
        total number of elements and the count of the top 5 ones,
        respectively.
    """
    if not html_content:
        raise ValueError('Input is empty')

    parser = HtmlElementsCounter()
    parser.feed(html_content)
    parser.close()  # instructs the parser to consume the input entirely

    total = sum(parser.occurrences_by_tag.values())
    # if the input only has N different elements (N < 5), this will hold
    # exactly N entries
    top5_elements_with_occurrences = sorted(
        parser.occurrences_by_tag.items(), reverse=True,
        key=lambda x: x[1])[:5]
    return dict(total=total, top5=top5_elements_with_occurrences)

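# A minimal usage sketch for scrape() above. It assumes HtmlElementsCounter is
# available in the same module; the sample markup is made up purely for
# illustration, mirroring the "raw HTML content" path the docstring mentions
# for unit testing.
def _example_scrape_usage():
    sample = '<html><body><p>a</p><p>b</p><div><span>c</span></div></body></html>'
    summary = scrape(sample)
    print(summary['total'])  # total number of elements encountered
    print(summary['top5'])   # up to five (tag, count) pairs, most frequent first
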
def fetch_card(card_name, dest_file=None):
    """
    Gets card_name's image from magiccards.info and writes it to dest_file.

    @param card_name: the name of the magic card.
    @param dest_file: the file to write the image file to, defaulting to the
        current directory and card_name as file name.
    """
    url = "http://magiccards.info/query?q=!{}&v=card&s=cname".format(
        urllib.parse.quote(card_name))

    # Can raise URLError
    response = urllib.request.urlopen(url)
    #print(response.info())
    file_contents = response.read().decode("utf-8")

    #with open("test.html", "w") as html_file:
    #    html_file.write(file_contents)
    #file_contents = open("test.html", "r").read()

    parser = ImageTagFinder(card_name, strict=True)
    parser.feed(file_contents)
    print(parser.result_url)
    if parser.result_url is None:
        raise RuntimeError("Couldn't find image for \"{}\"".format(card_name))

    image = urllib.request.urlopen(parser.result_url)
    with open(
            "{}.jpg".format(card_name.lower()) if dest_file is None else dest_file,
            "wb") as image_file:
        image_file.write(image.read())

def getentry(self):
    # Start with the entry from the parent.
    entry = FileHandler.getentry(self)
    parser = HTMLTitleParser()
    file = self.vfs.open(self.getselector(), "rt")

    try:
        while not parser.gotcompletetitle:
            line = file.readline()
            if not line:
                break
            parser.feed(line)
        parser.close()
    except html.parser.HTMLParseError:
        # Parse error?  Stop parsing, go to here.  We can still
        # return a title if the parse error happened after we got
        # the title.
        pass

    file.close()

    # OK, we've parsed the file and exited because of either an EOF
    # or a complete title (or error).  Now, figure out what happened.
    if parser.gotcompletetitle:
        # Convert all whitespace sequences to a single space.
        # Removes newlines, tabs, etc.  Good for presentation
        # and for security.
        title = re.sub(r'[\s]+', ' ', parser.titlestr)
        entry.setname(title)
    return entry

def xhamster_comment(link):
    class Parser(html.parser.HTMLParser):
        comments = []
        in_comments_block = False
        in_comment = False

        def handle_starttag(self, tag, attrs):
            if tag == "div" and ("id", "commentList") in attrs:
                self.in_comments_block = True
            self.in_comment = (self.in_comments_block and tag == "div"
                               and ("class", "oh") in attrs)

        def handle_endtag(self, tag):
            self.in_comment = False

        def handle_data(self, data):
            if not self.in_comment:
                return
            cleaned = data.replace("\r", "").replace("\n", "").replace("\\", "").strip()
            if cleaned.isprintable() and len(cleaned) > 0:
                self.comments.append(cleaned)

        def error(self, message):
            pass

    parser = Parser(convert_charrefs=True)
    try:
        response = requests.get(link, timeout=timeout,
                                headers={"User-Agent": random.choice(user_agents)})
        parser.feed(response.text)
    except:
        return "couldn't load comments :("

    comments = sorted(parser.comments, key=lambda x: len(x), reverse=True)
    return comments[0] if len(comments) > 0 else "no comments :("

def test(client, page):
    response = client.get(page, follow_redirects=True)
    assert response.status_code == 200
    parser = PageParser()
    parser.feed(response.data.decode(response.charset))
    for url in parser.urls:
        requests.request("HEAD", url).raise_for_status()

def main():
    import sys

    parser = MyHTMLParser()
    if len(sys.argv) > 1 and sys.argv[1]:
        url = sys.argv[1]
    else:
        url = "http://instagram.com/sarah3llen"

    try:
        response = urllib.request.urlopen(url, timeout=5)
        contentType = response.getheader("Content-Type")
        if "text/html" in contentType:
            data = response.read()
            html = data.decode("utf-8")
            parser.feed(html)
            if len(parser.ogpTitle) > len(parser.title):
                title = parser.ogpTitle
            else:
                title = parser.title.replace("\n", "")
        else:
            title = None
    except:
        title = None

    print(title)

def makeSampleHTTPRequest(url):
    # non-real time
    #resp = urllib.request.urlopen(url)
    #print(resp.read())

    # practice with parsing a URL
    link = "https://docs.python.org/3.6/library/urllib.parse.html#module-urllib.parse"
    urlTuple = urllib.parse.urlparse(link)
    #print(urlTuple)

    # make a POST request to search for "robot"
    link = "https://pythonprogramming.net/search"
    # build the values of the data: in this case the query value.
    # urllib.parse.urlencode takes a dict of key/values or a sequence of
    # 2-tuples (k, v) as input and encodes it to URL format.
    # The resulting string is a series of key=value pairs separated by '&'
    # characters, where both key and value are quoted using the quote_via
    # function. Then convert the URL-formatted string to bytes, in whatever
    # encoding the server accepts.
    values = {'q': 'robot'}
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')  # bytes now, depending on the server's accepted encoding

    # make a Request obj
    req = urllib.request.Request(link, data, method='POST')
    with urllib.request.urlopen(req) as resp:  # get response obj, catching exception
        parser = MyHTMLParser()
        text = resp.read()
        #parser.feed('<html><head><title>Test</title></head> <body><h1>Parse me!</h1></body></html>')
        parser.feed(text.decode())
        #print(pars)
        #print(resp.read())
    return None

def count(filename):
    if not HtmlWordCounter.can_count(filename):
        return 0
    parser = HtmlWordCounter.__HtmlParser()
    with open(filename, encoding="utf-8") as file:
        parser.feed(file.read())
    return parser.count

def simplify_html(s):
    """Make a real HTML text compatible with Telegram's pseudo-HTML"""
    parser = _HtmlSimplifying()
    parser.feed(s)
    parser.close()
    return parser.result

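# A minimal usage sketch for simplify_html() above, assuming the
# _HtmlSimplifying parser is defined elsewhere in the module. The input and
# the commented expected output are illustrative only: Telegram's pseudo-HTML
# keeps a small whitelist of tags (<b>, <i>, <a>, <code>, <pre>, ...), so
# everything else would typically be dropped or unwrapped.
def _example_simplify_html_usage():
    raw = '<div><h1>Title</h1><p>Hello <b>world</b></p></div>'
    print(simplify_html(raw))
    # e.g. 'Title\nHello <b>world</b>', depending on the parser's rules
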
def user_login(self, username, password):
    response = self._get_opener().open(CONF.dashboard.dashboard_url).read()

    # Grab the CSRF token and default region
    parser = HorizonHTMLParser()
    parser.feed(response.decode("utf-8"))

    # construct login url for dashboard, discovery accommodates non-/ web
    # root for dashboard
    login_url = parse.urljoin(CONF.dashboard.dashboard_url, parser.login)

    # Prepare login form request
    req = request.Request(login_url)
    req.add_header('Content-type', 'application/x-www-form-urlencoded')
    req.add_header('Referer', CONF.dashboard.dashboard_url)

    # Pass the default domain name regardless of the auth version in order
    # to test the scenario of when horizon is running with keystone v3
    params = {
        'username': username,
        'password': password,
        'region': parser.region,
        'domain': CONF.auth.default_credentials_domain_name,
        'csrfmiddlewaretoken': parser.csrf_token
    }
    self._get_opener().open(req, parse.urlencode(params).encode())

def get_dependencies(path):
    deps = set()
    parser = DependenciesParser(deps.add)
    with open(path) as f:
        parser.feed(f.read())
    parser.close()
    return iter(deps)

def forum_attachments(attach_content):
    parser = p_attachments()
    parser.feed(attach_content)
    markdown_attachments = '\n\n>Attachments:\n'
    for item in parser._a:
        markdown_attachments += '\n>* ' + item
    return markdown_attachments

def html_to_markdown(content, host):
    with open('debug/out', 'w') as f:
        f.write(content)
    parser = guildwars2_html2markdown.Htmlparser()
    parser.convert_charrefs = True
    parser.host = 'https://' + host
    content = content.replace('\n', '\n>')
    parser.feed(content)
    # content = tag_bold(content)
    # content = tag_italic(content)
    # content = tag_list(content)
    # content = tag_superscript(content)
    # content = tag_strikethrough(content)
    # content = tag_underline(content)
    # content = tag_breakrow(content)
    # content = tag_h1(content)
    # content = tag_h2(content)
    # content = tag_h3(content)
    # content = tag_h4(content)
    # content = tag_h5(content)
    # content = tag_h6(content)
    # content = tag_hr(content)
    # content = tag_screenshot(content, host)
    # content = tag_paragraph(content)
    # content = tag_iframe(content, host)
    # content = tag_href(content, host)
    # content = tag_img(content, host)
    # content = tag_quote(content, host)
    # content = tag_spoiler(content, host)
    # content = tag_object(content)
    # content = content.strip('\n')
    # content = '>' + content.replace('\n', '\n>')
    # content = tag_other(content)
    print(parser.result)
    return parser.result

def main(): """Make all of the above work together to finally print the RSS feed.""" # Initial request html_string = get_response_body('/archive?type=episodes') # Prepare headers for following requests HEADERS['Referer'] = ( 'https://www.thisamericanlife.org/archive?type=episodes' ) HEADERS['X-Requested-With'] = 'XMLHttpRequest' parser = Parser() parser.feed(html_string) tree = parser.close() episodes = findall_episodes(tree) count = tree.find('.//div[@class="count-sort"]/div[@class="count"]').text count = int(count.split()[2]) for page in range(int(count / 48)): page = page + 1 time.sleep(1) json_string = get_response_body(f'/archive?type=episodes&page={page}') html_string = json.loads(json_string)['html'] parser = Parser() parser.feed(html_string) tree = parser.close() new_episodes = findall_episodes(tree) episodes = episodes + new_episodes RSS['rss']['channel']['item'] = episodes xml_tree = dictionary_to_xml(RSS) xml_string = xml.etree.ElementTree.tostring( xml_tree, encoding='utf-8', method='xml' ).decode() print(xml_string)
async def search(title, title_types, year=None):
    '''Search by title in the IMDB site.

    title_type might be one of: 'Movie', 'TV Series', 'Video', 'Short',
    'TV Mini-Series', 'TV Movie', 'TV Episode' or 'Video Game'.
    '''
    # Check that the type of title is correct
    for t in title_types:
        if t not in _SEARCH_TYPES:
            raise ValueError("wrong type '{}'".format(t))

    # Build the URL of the search page
    search_attributes = {'ref_': 'nv_sr_fn', 's': 'tt', 'q': title}
    url = '{}?{}'.format(_IMDB_SEARCH_URL,
                         urllib.parse.urlencode(search_attributes))

    # Fetch the list of titles
    http_client = tornado.httpclient.AsyncHTTPClient()
    response = await http_client.fetch(url, headers=_HTTP_HEADERS)

    # Parse the desired information from the result
    parser = SearchParser()
    parser.feed(response.body.decode('utf-8'))

    # Return the list of titles.
    # Keep only the titles with the right type or, if the year is given, those
    # corresponding to that year.
    return [
        a for a in parser.results
        if a['type'] in title_types and (
            year is None or a['year'] == year or a['year'] == year - 1)
    ]

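# A minimal usage sketch for the async search() above. It assumes
# _SEARCH_TYPES, _IMDB_SEARCH_URL, _HTTP_HEADERS and SearchParser are defined
# in the same module; the title and the fields printed are illustrative only.
async def _example_imdb_search():
    results = await search('The Matrix', ['Movie'], year=1999)
    for r in results:
        print(r['type'], r.get('year'), r)

# e.g. tornado.ioloop.IOLoop.current().run_sync(_example_imdb_search)
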
def parse_category_ids(htmltext):
    """
    Example:
    <div data-id="601" class="catalogue-list-item">Analog ICs <br></div>
    """
    class CategoriesParser(html.parser.HTMLParser):
        categories = []
        last_category_id = None

        def handle_starttag(self, tag, attrs):
            category_id = None
            for name, value in attrs:
                if name == 'data-id':
                    category_id = value
            if category_id:
                self.last_category_id = category_id
                self.text = ''

        def handle_data(self, data):
            if self.last_category_id:
                self.text += data

        def handle_endtag(self, tag):
            if self.last_category_id:
                id_and_name = (self.last_category_id, self.text.strip())
                self.last_category_id = None
                self.text = ''
                self.categories.append(id_and_name)
                print(id_and_name)

    parser = CategoriesParser()
    parser.feed(htmltext)
    return parser.categories

def parse(content, count):
    parser = OAParser(count)
    parser.feed(content)
    data = copy.deepcopy(parser.objects)  # otherwise data from the previous run would linger
    parser.clear()
    return data

def parse_jobisjob_offers():
    offer_entries = []
    parser = JobIsJobParser()
    url = "/search?what=stage&where=rennes&category=IT&jobType=Stage"

    jobisjob_connection = http.client.HTTPConnection("www.jobisjob.fr")
    jobisjob_connection.connect()
    jobisjob_connection.request("GET", url)
    jobisjob_response = jobisjob_connection.getresponse()
    page = jobisjob_response.read().decode("utf-8")
    parser.feed(page)
    jobisjob_connection.close()

    next_urls = copy.deepcopy(parser.next_pages)
    offer_entries = offer_entries + parser.offer_entries

    for next_url in next_urls:
        parser = JobIsJobParser()
        jobisjob_connection = http.client.HTTPConnection("www.jobisjob.fr")
        jobisjob_connection.connect()
        jobisjob_connection.request("GET", next_url)
        jobisjob_response = jobisjob_connection.getresponse()
        if jobisjob_response.getcode() == 301:
            actual_url = jobisjob_response.getheader("Location")
            jobisjob_connection.close()
            if actual_url is None:
                continue
            jobisjob_connection = http.client.HTTPConnection("www.jobisjob.fr")
            jobisjob_connection.connect()
            jobisjob_connection.request("GET", actual_url)
            jobisjob_response = jobisjob_connection.getresponse()
        page = jobisjob_response.read().decode("utf-8")
        parser.feed(page)
        jobisjob_connection.close()
        offer_entries = offer_entries + parser.offer_entries

    return offer_entries

def bibtex_entry_from_pmid(pmid: str) -> str:
    assert pmid.isdigit(), pmid
    resp = requests.get(_TEXMED_URL_PATTERN.format(pmid=pmid))
    resp.raise_for_status()
    parser = _TeXMedHtmlParser()
    parser.feed(resp.text)
    return parser.bibtex_entry

def download_problem(contest_uri, problem):
    problem_uri = contest_uri + '/problem/' + problem
    print('Retrieving', problem_uri, '...')
    sys.stdout.flush()
    problem_html = urllib.request.urlopen(problem_uri).read().decode('utf-8')
    print('Retrieved problem {} ({} bytes).'.format(problem, len(problem_html)))

    # Hack for codeforces HTML errors
    problem_html = problem_html.replace('<p</p>', '<p></p>')
    problem_html = problem_html.replace('<ul</ul>', '<ul></ul>')
    problem_html = problem_html.replace('<div class="sample-test"<',
                                        '<div class="sample-test"><')

    parser = ProblemHTMLParser()
    try:
        parser.feed(problem_html)
    except:
        print(problem_html, file=sys.stderr)
        raise
    examples = parser.getExamples()

    problem_dir = problem.lower()
    if not os.path.isdir(problem_dir):
        os.mkdir(problem_dir)
    for i, example in enumerate(examples, 1):
        input_path = os.path.join(problem_dir, '{}.in.{}'.format(problem.lower(), i))
        with open(input_path, 'w') as f:
            f.write(example[0])
        output_path = os.path.join(problem_dir, '{}.out.{}'.format(problem.lower(), i))
        with open(output_path, 'w') as f:
            f.write(example[1])
    print('Wrote {} examples for problem {}.'.format(len(examples), problem))

def getCardDetails(self, uid):
    user = rfidUser()
    user.rfidCardUid = binascii.hexlify(uid).decode('ascii')

    # Query the ifaic staff id using the card's UID
    logging.debug("request ifaic staff id")
    req = urllib.request.Request(
        url=self._url + 'cards/{}'.format(binascii.hexlify(uid).decode('ascii')),
        headers={"Content-Type": "application/json"})
    userAndPass = base64.b64encode(
        "{}:{}".format(self._user, self._password).encode()).decode("ascii")
    req.add_header("Authorization", 'Basic {:s}'.format(userAndPass))
    resp = urllib.request.urlopen(req)
    body = resp.read()
    data = json.loads(body.decode())
    user.staffId = data['ikaFkaIdentStaffId']

    # Determine first and last name from the staff id and the ifaic staff list page
    #req = urllib.request.Request(url='https://ifaic.ika.rwth-aachen.de/info/mitarbeiterliste_komp.php')
    logging.debug("request ifaic user name")
    req = urllib.request.Request(url=self._mitarbeiterlisteUrl)
    resp = urllib.request.urlopen(req)
    body = resp.read()

    logging.debug("parse response")
    # For faster parsing, the parser raises an exception as soon as a result is found
    try:
        parser = ifaicMitarbeiterlisteParser(user.staffId)
        parser.feed(body.decode('iso-8859-1'))
    except:
        logging.debug("parsed")
        logging.debug(parser.userName)
        user.surname = parser.surname
        user.givenName = parser.givenName
        user.userName = parser.userName
        return user
    return None

def auth_usr(email, password, client_id, scope, opener):
    print("TRY TO AUTH")
    # TODO: catch the exception
    login_page = "http://oauth.vk.com/oauth/authorize?" + \
                 "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
                 "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope))
    #print(login_page)
    auth_page = opener.open("http://oauth.vk.com/oauth/authorize?" + \
                            "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
                            "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope)))
    auth_page = auth_page.read()

    parser = AuthParser()
    parser.feed(str(auth_page))
    parser.close()

    if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
            "email" not in parser.params or parser.method != "POST":
        parser.error = "Some problems"
    if parser.error != "OK":
        return -1, -1, parser.error

    parser.params["email"] = email
    parser.params["pass"] = password
    parser.params["v"] = "5.2"

    # TODO: catch the exception
    response = opener.open(parser.url,
                           urllib.parse.urlencode(parser.params).encode("UTF-8"))
    page = response.read()
    url = response.geturl()
    return page, url, parser.error

def handle_body(body):
    parser = HTMLTitleParser()
    parser.feed(body.decode('utf-8'))
    if parser.title_data == self._library_title:
        return True
    return False

def title(self, irc, msg, args, optlist, url):
    """[--no-filter] <url>

    Returns the HTML <title>...</title> of a URL.
    If --no-filter is given, the bot won't strip special chars (action,
    DCC, ...).
    """
    if not self._checkURLWhitelist(url):
        irc.error("This url is not on the whitelist.")
        return
    size = conf.supybot.protocols.http.peekSize()
    text = utils.web.getUrl(url, size=size)
    try:
        text = text.decode(utils.web.getEncoding(text) or 'utf8',
                           'replace')
    except:
        pass
    parser = Title()
    try:
        parser.feed(text)
    except html.parser.HTMLParseError:
        self.log.debug('Encountered a problem parsing %u.  Title may '
                       'already be set, though', url)
    if parser.title:
        title = utils.web.htmlToText(parser.title.strip())
        if not [y for x, y in optlist if x == 'no-filter']:
            for i in range(1, 4):
                title = title.replace(chr(i), '')
        irc.reply(title)
    elif len(text) < size:
        irc.reply(_('That URL appears to have no HTML title.'))
    else:
        irc.reply(format(_('That URL appears to have no HTML title '
                           'within the first %S.'), size))

def _query_eol_list(self) -> typing.List[str]:
    """Scrape the FreeBSD website and return a list of EOL RELEASES."""
    request = urllib.request.Request(
        self.eol_url,
        headers={
            "Accept-Charset": "utf-8"
        })
    self.logger.verbose(f"Downloading EOL info from {self.eol_url}")
    with urllib.request.urlopen(request) as response:  # nosec: B310

        response_code = response.getcode()
        if response_code != 200:  # noqa: T484
            libioc.errors.DownloadFailed(
                topic="EOL Warnings",
                code=response_code,
                logger=self.logger,
                level="warning"
            )
            return []

        parser = EOLParser()
        data = response.read().decode("utf-8", "ignore")
        parser.feed(data)
        parser.close()

        return parser.eol_releases

def latest_simc_version(
        major_ver: str = _CURRENT_SIMC_VERSION,
        platform: Optional[str] = None) -> Optional[Tuple[str, str]]:
    """
    Checks the SimulationCraft nightly builds for the latest binary build.

    Args:
        major_ver: A major version of simc, eg: '910-01'.
        platform: Platform to check, eg: 'win64', 'macos', or None to
            auto-detect.

    Returns:
        tuple of (filename, git_commit) with the latest version.
    """
    if not platform:
        platform = simc_platform()
    if not platform:
        # Unsupported platform, cannot answer.
        return

    listing = urlopen(
        'http://downloads.simulationcraft.org/nightly/?C=M;O=D'
    ).read().decode('utf-8')
    parser = LinkParser()
    parser.feed(listing)

    # Parse the filenames
    for link in parser.links:
        v = _PACKAGE_RE.match(link)
        if not v:
            continue
        if major_ver == v.group('version') and platform == v.group('platform'):
            return (v.group('filename'), v.group('git_commit'))

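# A minimal usage sketch for latest_simc_version() above, assuming
# _CURRENT_SIMC_VERSION, _PACKAGE_RE, LinkParser and simc_platform() exist in
# the same module; the version and platform strings are illustrative.
def _example_latest_simc_version():
    result = latest_simc_version(major_ver='910-01', platform='win64')
    if result:
        filename, git_commit = result
        print(filename, git_commit)
    else:
        print('no matching nightly build found')
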
def parse_indeed_offers():
    offer_entries = []
    parser = IndeedParser()
    url = "/emplois?q=informatique&l=Rennes+(35)&radius=5&jt=internship&sort=date"

    indeed_connection = http.client.HTTPConnection("www.indeed.fr")
    indeed_connection.connect()
    indeed_connection.request("GET", url)
    indeed_response = indeed_connection.getresponse()
    parser.feed(indeed_response.read().decode("utf-8"))
    indeed_connection.close()

    next_urls = copy.deepcopy(parser.next_pages)
    offer_entries = offer_entries + parser.offer_entries

    for next_url in next_urls:
        parser = IndeedParser()
        indeed_connection = http.client.HTTPConnection("www.indeed.fr")
        indeed_connection.connect()
        indeed_connection.request("GET", next_url)
        indeed_response = indeed_connection.getresponse()
        if indeed_response.getcode() == 301:
            actual_url = indeed_response.getheader("Location")
            indeed_connection.close()
            if actual_url is None:
                continue
            indeed_connection = http.client.HTTPConnection("www.indeed.fr")
            indeed_connection.connect()
            indeed_connection.request("GET", actual_url)
            indeed_response = indeed_connection.getresponse()
        page = indeed_response.read().decode("utf-8")
        parser.feed(page)
        indeed_connection.close()
        offer_entries = offer_entries + parser.offer_entries

    return offer_entries

def extract_angular(fileobj, keywords, comment_tags, options):
    """Extract messages from angular template (HTML) files.

    It extracts messages from angular template (HTML) files that use the
    angular-gettext translate directive as per
    https://angular-gettext.rocketeer.be/

    :param fileobj: the file-like object the messages should be extracted
                    from
    :param keywords: This is a standard parameter so it is accepted but
                    ignored.
    :param comment_tags: This is a standard parameter so it is accepted but
                    ignored.
    :param options: Another standard parameter that is accepted but ignored.
    :return: an iterator over ``(lineno, funcname, message, comments)``
             tuples
    :rtype: ``iterator``
    """
    parser = AngularGettextHTMLParser()

    for line in fileobj:
        parser.feed(encodeutils.safe_decode(line))

    for string in parser.strings:
        yield (string)

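# A minimal usage sketch for extract_angular() above, assuming
# AngularGettextHTMLParser and encodeutils are importable from the same
# module. The template snippet is illustrative; per the docstring, each
# yielded item should already be a (lineno, funcname, message, comments)
# tuple assembled by the parser.
def _example_extract_angular():
    import io
    template = io.StringIO('<div translate>Hello world</div>\n')
    for entry in extract_angular(template, keywords=None, comment_tags=None,
                                 options=None):
        print(entry)
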
def getUrlToPdf():
    """
    Searches the delivery-service website for the URL of the menu (Speisekarte):
    - scans the website for a link to the menu
    - builds the URL from the website URL and the reference to the menu
    :return: the assembled URL
    """
    # Fetch the website and read its HTML
    r = urlopen(_capriUrl + 'lieferservice.html')
    html = r.read().decode("utf-8")

    parser = HTMLParser()
    parser.feed(html)
    links = parser.getLinks()
    print(parser.getLinks())

    # Check all found links
    for link in links:
        # Check for link to pdf file in img folder
        if link.startswith("img/") and link.endswith(".pdf"):
            linkToPdf = _capriUrl + link
            print("Link to pdf:", linkToPdf)
            return linkToPdf

    # No link to pdf found
    return None

def download_problem(contest_uri, problem):
    problem_uri = contest_uri + '/problem/' + problem
    print('Retrieving', problem_uri, '...')
    sys.stdout.flush()
    problem_html = urllib.request.urlopen(problem_uri).read().decode('utf-8')
    print('Retrieved problem {} ({} bytes).'.format(problem, len(problem_html)))

    # Hack for codeforces HTML errors
    problem_html = problem_html.replace('<p</p>', '<p></p>')
    problem_html = problem_html.replace('<ul</ul>', '<ul></ul>')
    problem_html = problem_html.replace('<div class="sample-test"<',
                                        '<div class="sample-test"><')

    parser = ProblemHTMLParser()
    try:
        parser.feed(problem_html)
    except:
        print(problem_html, file=sys.stderr)
        raise
    examples = parser.getExamples()

    for i, example in enumerate(examples, 1):
        input_path = INPUT_TESTCASE_FORMAT.format(problem=problem, number=i)
        write_example(input_path, example[0])
        output_path = OUTPUT_TESTCASE_FORMAT.format(problem=problem, number=i)
        write_example(output_path, example[1])
    print('Wrote {} examples for problem {}.'.format(len(examples), problem))

async def __fetch_and_feed(self, client: aiohttp.client.ClientSession, url,
                           parser: htmlParser, client_num: int):
    # logger.debug(f"client - {client_num}, try to get url <{url}>")
    resp: aiohttp.client.ClientResponse
    while True:
        async with client.get(url) as resp:
            if resp.status < 200 or resp.status >= 300:
                logger.warning(f"server response - {resp.status}")
                break
            try:
                html_data = await resp.text()
            except UnicodeDecodeError as err:
                logger.warning(err)
                break
            # print(html_data)
            parser.feed(html_data)
            m_v = html_url_regex_3.match(url)
            if m_v is None or m_v.group(4) == "":
                logger.warning(f"Processing url <{url}> caused an exception; "
                               f"the url doesn't correspond to the content.")
                break
            PageId = int(m_v.group(4))
            if parser.HasContent and (parser.BookName != "" and parser.BookName is not None):
                AppendContext(parser.BookName, PageId, parser.Content)
            break
    await self.__client_list.ReleaseClient(client_num)
    return

def tag_screenshot(content, host):
    re_sss = re.findall(r'<p class="screenshot">.*?</p>', content)
    for re_ss in re_sss:
        if re_ss != '':
            parser = p_screenshots()
            parser.feed(re_ss)
            content = content.replace(re_ss, parser._src + '\n')
    return content

def tag_iframe(content, host):
    re_ifrs = re.findall(r'<iframe.*?src="[^"]*".*?>.*?</iframe>', content)
    for re_ifr in re_ifrs:
        if re_ifr != '':
            parser = p_iframe()
            parser.feed(re_ifr)
            content = content.replace(re_ifr, parser._src)
    return content

def getBuildsFromPage(self, appId, pageIndex):
    request = urllib.request.Request(
        "https://www.testflightapp.com/dashboard/applications/%d/builds/?page=%d"
        % (appId, pageIndex + 1))
    response = self.opener.open(request)
    htmlContent = response.read().decode('ascii')

    parser = self.BuildsOnPageParser()
    parser.feed(htmlContent)

    print("Found %d builds on page %d" % (len(parser.builds), pageIndex + 1))
    return parser.builds

def run(self):
    parser = Parser()
    with open(os.path.join('..', 'docs', 'docs.polserver.com', 'pol099',
                           'include', 'escriptguide.inc')) as f:
        while True:
            l = f.readline()
            if not l:
                break
            parser.feed(l)

def search(db, terms):
    f = open(db)
    xml = f.readlines()
    f.close()

    parser = BayParser()
    parser.q = (' '.join(terms)).lower()
    parser.feed(''.join(xml))
    return parser.results

def parse(company_number: str) -> (str, str, str):
    html = _fetch_company_data(company_number)
    if not html:
        return None

    parser = SummaryParser()
    parser.feed(html)

    return parser.company_status, parser.company_incorporated, parser.company_type

def parse(markup):
    parser = Parser()
    parser.feed(markup)
    gen = HtmlGenerator(parser.parse_result)
    try:
        return gen.get_html()
    except CompileException as ex:
        return '<h1>Error</h1><pre>{}</pre>'.format(ex)

def loadPages(self, dir):
    for filename in os.listdir(dir):
        pattern = re.compile(r".*\.html")
        parser = self.WordCounter()
        if re.match(pattern, filename):
            file = open(dir + filename, 'r')
            parser.feed(file.read())
            file.close()
            self.sitesDict[filename] = parser.wordCount

def to_json(content, raise_exception=True):
    """
    converts HTML into JSON

    @param content          HTML content to parse
    @param raise_exception  if True, raises an exception if the HTML is
                            malformed, otherwise does what it can
    """
    parser = HTMLtoJSONParser(raise_exception=raise_exception)
    parser.feed(content)
    return parser.json

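# A minimal usage sketch for to_json() above, assuming HTMLtoJSONParser is
# defined in the same module; the markup and the exact shape of the returned
# JSON depend entirely on that parser's implementation.
def _example_to_json_usage():
    markup = '<ul><li>one</li><li>two</li></ul>'
    print(to_json(markup))
    # lenient mode: do the best we can with malformed markup
    print(to_json('<p>unclosed', raise_exception=False))
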
def parse_links(self):
    'Parse out the links found in the downloaded HTML file'
    with open(self.file, 'r') as f:
        data = f.read()
    # Python 3's html.parser.HTMLParser accepts no formatter argument and has
    # no anchorlist attribute (those belonged to the old htmllib/formatter
    # modules), so collect <a href="..."> values directly.
    class AnchorParser(html.parser.HTMLParser):
        def __init__(self):
            super().__init__()
            self.anchorlist = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                self.anchorlist.extend(v for k, v in attrs if k == 'href' and v)

    parser = AnchorParser()
    parser.feed(data)
    parser.close()
    return parser.anchorlist

def scrape(path):
    request = urllib.request.Request('http://en.wikipedia.org' + path)
    request.add_header('User-agent', 'Mozilla/5.0')
    response = urllib.request.urlopen(request, timeout=10)

    parser = Parser()
    parser.feed(response.read().decode())

    return {
        key: value for key, value in parser.__dict__.items()
        if key in ['influenced', 'influenced_by', 'appeared_in']
    }