def get_wiki_info(name):
    url = "http://ja.wikipedia.org/w/index.php?title={0}&action=edit".format(quote(name))
    data = get_page_with_cache("edit_"+name, url)
    inside_textarea = False
    lon = lat = None
    # for each line
    for line in data.splitlines():
        # make sure that we only analyze inside the textarea
        if inside_textarea==False:
            if line.find("<textarea")!=-1:
                inside_textarea = True
            continue
        line = line.strip()
        # after this mark, no interesting data
        if line.find("</textarea")!=-1:
            inside_textarea = False
            continue
        if line.find("{{aimai}}")!=-1:
            raise Exception("disambiguation page")
        # Detect Mediawiki tagging: |key = value
        if line.startswith("|")==False or line.find("=")==-1:
            continue
        # extract the mediawiki key and value from the line
        # which could be for example:
        # |よみがな = とうきょう
        try:
            key, val = line[1:].split("=", 1)
            key = key.strip()
            val = val.split("<")[0].strip()
            val = parser.unescape(parser.unescape(val))
        except Exception as e:
            continue
        if key=="座標":
            # example: {{ウィキ座標2段度分秒|34|45|47.43|N|135|31|25.21|E|}}
            fields = val.split("|")
            if len(fields)>=7:
                lat = convert_coord(fields[1:4]) or lat
                lon = convert_coord(fields[5:8]) or lon
        # other coordinate tag which can contain lat or lon, or both
        elif key=="緯度度" or key=="経度度":
            pairs = dict(map(lambda x: x.strip(), p.split("=")) for p in line[1:].split("|"))
            lat_fields = (pairs.get("緯度度"), pairs.get("緯度分"), pairs.get("緯度秒"))
            lon_fields = (pairs.get("経度度"), pairs.get("経度分"), pairs.get("経度秒"))
            if None not in lat_fields:
                lat = convert_coord(lat_fields)
            if None not in lon_fields:
                lon = convert_coord(lon_fields)
    # if not lat or not lon:
    #     print(name, lat, lon)
    return lat, lon
def set_request_cookie(self, user_id):
    """
    POST request for non-json content body

    :param user_id: userid used to send the request
    :type user_id: string
    """
    # Assumes password is always user_id + 1234
    password = user_id + '1234'
    # Make a request to the smarter URL so that we get redirected to the OpenAM page.
    self.send_request("GET", "/data")
    json_response = self._response.json()
    redirect = json_response['redirect']
    self.send_request("GET", redirect, use_base=False)
    # This should redirect us to the IDP page. Extract the response message.
    response = self._response.content.decode('utf-8')
    # Search for regular expressions from the response body
    goto = re.search('name=\\"goto\\" value=\\"(.*)\\"', response).group(1)
    sun_query = re.search('name=\\"SunQueryParamsString\\" value=\\"(.*)\\"', response).group(1)
    self.set_request_header('content-type', 'application/x-www-form-urlencoded')
    # Compose the redirect path using the parameters extracted from the last GET request made
    # to smarter, then submit the request to get the login form from IDP.
    request_data = {'goto': goto,
                    'SunQueryParamsString': sun_query,
                    'IDButton': 'Log In',
                    'gx_charset': 'UTF-8',
                    'encoded': 'true',
                    'IDToken1': user_id,
                    'IDToken2': password}
    # Send login request to IDP
    self.send_post(preferences(Default.idp), request_data)
    # Extract the response received from IDP
    response = self._response.content.decode('utf-8')
    # Submit the login information from the login form received in the previous request
    # and send a POST request to smarter to get the cookie information.
    parser = html.parser.HTMLParser()
    url = re.search('action=\\"(.*?)\\"', response).group(1)
    samlresponse = re.search('name=\\"SAMLResponse\\" value=\\"(.*?)\\"', response).group(1)
    relaystate = re.search('name=\\"RelayState\\" value=\\"(.*?)\\"', response).group(1)
    data = {'SAMLResponse': samlresponse, 'RelayState': relaystate}
    self.set_request_header('content-type', 'application/x-www-form-urlencoded')
    # unescape the strings
    url = parser.unescape(str(url))
    data['SAMLResponse'] = parser.unescape(str(data['SAMLResponse']))
    data['RelayState'] = parser.unescape(str(data['RelayState']))
    # Send post request
    self.send_post(url, data)
    response = self._response.content.decode('utf-8')
    # Get the cookie from the response
    cookie_value = self._response.cookies
    self._request_header['cookies'] = cookie_value
def format_transcript_element(self, element, element_number):
    """
    Format transcript's element in order for it to be converted to WebVTT format.
    """
    sub_element = "\n\n"
    html_parser = HTMLParser()
    if element.tag == "text":
        start = float(element.get("start"))
        duration = float(element.get("dur", 0))  # dur is not mandatory
        text = element.text
        end = start + duration
        if text:
            formatted_start = self.format_transcript_timing(start)
            formatted_end = self.format_transcript_timing(end, 'end')
            timing = '{} --> {}'.format(formatted_start, formatted_end)
            text_encoded = text.encode('utf8', 'ignore')
            text = text_encoded.replace(b'\n', b' ')
            unescaped_text = unescape(text.decode('utf8'))
            sub_element = """\
            {element_number}
            {timing}
            {unescaped_text}

            """.format(element_number=element_number, timing=timing,
                       unescaped_text=unescaped_text)
    return textwrap.dedent(sub_element)
def clean(text):
    """
    A function for cleaning a string of text. Returns valid ASCII characters.
    """
    import sys, unicodedata
    text = clean_whitespace(text)

    # Remove links from message
    #text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Replace HTML escape characters
    if sys.version_info[0] < 3:
        from HTMLParser import HTMLParser
        parser = HTMLParser()
        text = parser.unescape(text)
    else:
        import html.parser
        parser = html.parser.HTMLParser()
        text = parser.unescape(text)

    # Normalize unicode characters
    # 'raw_input' is just 'input' in python3
    if sys.version_info[0] < 3:
        text = unicode(text)

    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")

    return str(text)
def enrich_data(self, item_loader, response):
    item_loader.add_re("id", r'tid=(\d+)')
    title = "".join(response.xpath('//h4/text()').extract())
    item_loader.add_value("title", title)
    video_url = unescape("".join(
        response.selector.re(r"\.src='(.*)#iframeload'")))
    return [("video_url", item_loader, {"url": video_url})]
def run(self):
    """
    Checks through the submissions and archives and posts comments.
    """
    if not self._setup:
        raise Exception("Snapshiller not ready yet!")
    submissions = r.get_new(limit=self.limit)

    for submission in submissions:
        # Your crap posts aren't worth wasting precious CPU cycles and
        # archive.is and archive.org's bandwidth. HAIL ELLEN PAO
        if submission.author and submission.author.name == "PoliticBot":
            log.info("Submission by banned user; skipping.")
            continue

        log.debug("Found submission.\n" + submission.permalink)

        if not should_notify(submission):
            log.debug("Skipping.")
            continue

        archives = [ArchiveContainer(fix_url(submission.url), "*This Post*")]

        if submission.is_self and submission.selftext_html is not None:
            log.debug("Found text post...")

            links = BeautifulSoup(unescape(
                submission.selftext_html)).find_all("a")

            if not len(links):
                continue

            for anchor in links:
                log.debug("Found link in text post...")

                url = fix_url(anchor['href'])
                archives.append(ArchiveContainer(url, anchor.contents[0]))
                ratelimit(url)

        Notification(submission, self._get_header(submission.subreddit),
                     archives).notify()
        db.commit()
def clean(text):
    """
    A function for cleaning a string of text. Returns valid ASCII characters.
    """
    import unicodedata
    import sys
    text = clean_whitespace(text)

    # Remove links from message
    # text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Replace HTML escape characters
    if sys.version_info[0] < 3:
        from HTMLParser import HTMLParser
        parser = HTMLParser()
        text = parser.unescape(text)
    else:
        import html.parser
        parser = html.parser.HTMLParser()
        text = parser.unescape(text)

    # Normalize unicode characters
    # 'raw_input' is just 'input' in python3
    if sys.version_info[0] < 3:
        text = unicode(text)

    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")

    return str(text)
def parse_from_url(url: str) -> dict:
    """Process the url with newspaper."""
    ret = Article(
        url,
        browser_user_agent=
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.16 Safari/537.36 Edg/79.0.309.15',
        request_timeout=30,
        keep_article_html=True)
    ret.download()  # download the page
    if ret.download_state == 2:
        ret.parse()  # parse the page
        item = {
            'url': url,
            'title': ret.title,
            'keywords': ret.meta_keywords,
            'description': ret.meta_description,
            'author': ret.authors,
            'publishdate': str(ret.publish_date) if ret.publish_date else '',
            'content': ret.text,
            'content_html': re.sub(r'\r|\n|\t', '', unescape(ret.article_html)),
        }
        return item
    else:
        raise Exception(
            f'Page download error, download_state: {ret.download_state}')
def run(self):
    """
    Checks through the submissions and archives and posts comments.
    """
    if not self._setup:
        raise Exception("Snapshill not ready yet!")

    submissions = self.reddit.front.new(limit=self.limit)

    for submission in submissions:
        debugTime = time.time()
        warned = False

        log.debug("Found submission.\n" + submission.permalink)

        if not should_notify(submission):
            log.debug("Skipping.")
            continue

        archives = [
            ArchiveContainer(fix_url(submission.url), submission.title)
        ]

        if submission.is_self and submission.selftext_html is not None:
            log.debug("Found text post...")

            links = BeautifulSoup(unescape(
                submission.selftext_html)).find_all("a")

            finishedURLs = []

            for anchor in links:
                if time.time() > debugTime + WARN_TIME and not warned:
                    log.warn(
                        "Spent over {} seconds on post (ID: {})".format(
                            WARN_TIME, submission.name))
                    warned = True

                log.debug("Found link in text post...")

                url = fix_url(anchor["href"])

                if skip_url(url):
                    continue

                if url in finishedURLs:
                    continue  # skip for sanity

                archives.append(ArchiveContainer(url, anchor.contents[0]))
                finishedURLs.append(url)
                ratelimit(url)

        Notification(
            self.reddit,
            submission,
            self._get_header(submission.subreddit),
            archives,
        ).notify()
        db.commit()
def crawl(urls, links=100):
    res = set()
    crawled = set()
    q = deque(urls)
    parser = html.parser.HTMLParser()

    while len(q) > 0 and len(crawled) < links:
        url = q.popleft()
        if url in crawled:
            continue
        try:
            req = requests.get(url)
        except:
            continue
        if req.status_code != requests.codes.ok:
            continue

        print(url, file=sys.stderr)

        req.encoding = 'utf-8'
        contents = parser.unescape(req.text)

        newLinks = page_links(contents)
        for l in newLinks:
            q.extend([l])

        newProxies = page_leech(contents)
        for p in newProxies:
            res.add(p)

        crawled.add(url)

    return list(res)
def get_rte_components(html_string):
    """Extracts the RTE components from an HTML string.

    Args:
        html_string: str. An HTML string.

    Returns:
        list(dict). A list of dictionaries, each representing an RTE component.
        Each dict in the list contains:
        - id: str. The name of the component, i.e. 'oppia-noninteractive-link'.
        - customization_args: dict. Customization arg specs for the component.
    """
    parser = html.parser.HTMLParser()
    components = []
    soup = bs4.BeautifulSoup(html_string, 'html.parser')
    oppia_custom_tag_attrs = (
        rte_component_registry.Registry.get_tag_list_with_attrs())
    for tag_name in oppia_custom_tag_attrs:
        component_tags = soup.find_all(name=tag_name)
        for component_tag in component_tags:
            component = {'id': tag_name}
            customization_args = {}
            for attr in oppia_custom_tag_attrs[tag_name]:
                # Unescape special HTML characters such as '&quot;'.
                attr_val = parser.unescape(component_tag[attr])
                # Adds escapes so that things like '\frac' aren't
                # interpreted as special characters.
                attr_val = attr_val.encode('unicode_escape')
                customization_args[attr] = json.loads(attr_val)
            component['customization_args'] = customization_args
            components.append(component)
    return components
def typeAnsAnswerFilter(self, buf):
    if not self.typeCorrect:
        return re.sub(self.typeAnsPat, "", buf)
    origSize = len(buf)
    buf = buf.replace("<hr id=answer>", "")
    hadHR = len(buf) != origSize
    # munge correct value
    parser = html.parser.HTMLParser()
    cor = stripHTML(self.mw.col.media.strip(self.typeCorrect))
    # ensure we don't chomp multiple whitespace
    cor = cor.replace(" ", "&nbsp;")
    cor = parser.unescape(cor)
    cor = cor.replace("\xa0", " ")
    given = self.typedAnswer
    # compare with typed answer
    res = self.correct(given, cor, showBad=False)
    # and update the type answer area
    def repl(match):
        # can't pass a string in directly, and can't use re.escape as it
        # escapes too much
        s = """
<span style="font-family: '%s'; font-size: %spx">%s</span>""" % (
            self.typeFont, self.typeSize, res)
        if hadHR:
            # a hack to ensure the q/a separator falls before the answer
            # comparison when user is using {{FrontSide}}
            s = "<hr id=answer>" + s
        return s
    return re.sub(self.typeAnsPat, repl, buf)
def typeAnsAnswerFilter(self, buf):
    if not self.typeCorrect:
        return re.sub(self.typeAnsPat, "", buf)
    origSize = len(buf)
    buf = buf.replace("<hr id=answer>", "")
    hadHR = len(buf) != origSize
    # munge correct value
    parser = html.parser.HTMLParser()
    cor = self.mw.col.media.strip(self.typeCorrect)
    cor = re.sub("(\n|<br ?/?>|</?div>)+", " ", cor)
    cor = stripHTML(cor)
    # ensure we don't chomp multiple whitespace
    cor = cor.replace(" ", "&nbsp;")
    cor = parser.unescape(cor)
    cor = cor.replace("\xa0", " ")
    cor = cor.strip()
    given = self.typedAnswer
    # compare with typed answer
    res = self.correct(given, cor, showBad=False)
    # and update the type answer area
    def repl(match):
        # can't pass a string in directly, and can't use re.escape as it
        # escapes too much
        s = """
<span style="font-family: '%s'; font-size: %spx">%s</span>""" % (
            self.typeFont, self.typeSize, res)
        if hadHR:
            # a hack to ensure the q/a separator falls before the answer
            # comparison when user is using {{FrontSide}}
            s = "<hr id=answer>" + s
        return s
    return re.sub(self.typeAnsPat, repl, buf)
def toggle_checkbox_ajax():
    with session_scope() as session:
        try:
            db_type = request.form['db_type']
            db_id = int(request.form['db_id'])
            item_number = int(request.form['item_number'][9:])
            parser = html.parser.HTMLParser()
            item_text = parser.unescape(request.form['item_text'])
            if db_type == 'Source':
                db_ob = db.Source
            elif db_type == 'Series':
                db_ob = db.Series
            elif db_type == 'Term':
                db_ob = db.Term
            else:
                return jsonify({"status": "failure"})
            try:
                ob = session.query(db_ob).get(int(db_id))
                if not ob:
                    return jsonify({"status": "failure"})
                ob.notes = utils.toggle_checkbox(ob.notes, item_number, item_text)
                session.commit()
                return jsonify({"status": "success"})
            except:
                return jsonify({"status": "failure"})
        except:
            session.rollback()
            return jsonify({"status": "failure"})
def get_msg(self):
    if "none" not in self._response.url:
        try:
            self._selector = etree.HTML(self._response.content)
            self._lis = self._selector.xpath(
                "//div[@class = 'basic-info cmn-clearfix']")
            self._name = [unescape(x.xpath('string(.)')).strip().replace("\xa0", "")
                          for x in self._lis[0].xpath("//dt[@class = 'basicInfo-item name']")]
            self._value = [unescape(x.xpath('string(.)')).strip().replace("\xa0", "")
                           for x in self._lis[0].xpath("//dd[@class = 'basicInfo-item value']")]
            self.msg = dict(zip(self._name, self._value))
        except:
            self.msg = {}
    else:
        self.msg = {}
def unescape(text):
    if sys.version_info[0] < 3:
        print("1")
        parser = HTMLParser.HTMLParser()
    else:
        print("2")
        parser = html.parser.HTMLParser()
    return parser.unescape(text)
def render_resource(path, **context):
    """
    Render static resource using provided context.

    Returns: django.utils.safestring.SafeText
    """
    html = Template(resource_string(path))
    return html_parser.unescape(html.render(Context(context)))
def create_song(raw):
    """
    Create a Song object.

    :param raw:
    :return:
    """
    parser = html.parser
    song = Song()
    song.title = parser.unescape(raw['songName'])
    song.artist = parser.unescape(raw['artist'])
    song.album_name = parser.unescape(raw['album_name'])
    song.song_id = raw['song_id']
    song.album_id = raw['album_id']
    song.location = raw['location']
    song.lyric_url = raw['lyric_url']
    song.pic_url = raw['pic']
    return song
def parse_html(html):
    data = html
    result_html = etree.HTML(data)
    items = result_html.xpath('//div[contains(@class,"userContentWrapper")]')
    result = ''
    for item in items:
        data = tostring(item, method='html')
        result = result + unescape(data.decode())
    return result
def get_content():
    raw_text = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.main'))).text
    content = ''
    page_list = browser.find_elements_by_class_name('reader-txt-layer')
    for page in page_list:
        content += (unescape(page.text) + '\n')
    return content, raw_text
def replaceHTMLCodes(txt):
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    if version_info[0] == 3:
        txt = HTMLParser.unescape(txt)
    else:
        txt = HTMLParser.HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", '"')
    txt = txt.replace("&amp;", "&")
    return txt
def get_tv_series(url):
    sess.headers.update({
        'User-Agent': USER_AGENT,
        'Referer': HEJO_TV_BASE_URL,
    })
    html = get_url(url)
    xbmc.log("get_tv_series \n{}".format(html), level=xbmc.LOGINFO)
    dom = HtmlDom().createDom(html)
    series_list = dom.find('div.ml-item')
    xbmc.log("get_tv_series, found {} series".format(series_list.len),
             level=xbmc.LOGINFO)
    for series in series_list:
        xbmc.log("{}\n\n\n\n".format(series.html()), level=xbmc.LOGINFO)
        series_url = series.find('a.ml-mask').attr('href')
        series_title = unescape(series.find('a.ml-mask').attr('title'))
        series_poster = "{}{}".format(
            HEJO_TV_BASE_URL,
            series.find('img.thumb').first().attr('src'))
        series_description = unescape(
            series.find('div#hidden_tip').first().text().strip())
        series_info = {
            'plot': series_description,
            'title': series_title,
        }
        xbmc.log("{}, {}, {}".format(series_url, series_poster, series_description),
                 level=xbmc.LOGINFO)
        add_item(name=series_title,
                 url=series_url,
                 mode='get_tv_stream',
                 image=series_poster,
                 folder=False,
                 isplay=False,
                 infoLabels=series_info,
                 itemcount=series_list.len)
    xbmcplugin.setContent(addon_handle, 'videos')
    xbmcplugin.endOfDirectory(addon_handle, True)
def run(self):
    """
    Checks through the submissions and archives and posts comments.
    """
    if not self._setup:
        raise Exception("Snapshiller not ready yet!")
    submissions = r.get_new(limit=self.limit)

    for submission in submissions:
        debugTime = time.time()
        warned = False
        log.debug("Found submission.\n" + submission.permalink)

        if not should_notify(submission):
            log.debug("Skipping.")
            continue

        archives = [ArchiveContainer(fix_url(submission.url), "*This Post*")]

        if submission.is_self and submission.selftext_html is not None:
            log.debug("Found text post...")

            links = BeautifulSoup(unescape(
                submission.selftext_html)).find_all("a")

            if not len(links):
                continue

            finishedURLs = []

            for anchor in links:
                if time.time() > debugTime + WARN_TIME and not warned:
                    log.warn("Spent over {} seconds on post (ID: {})".format(
                        WARN_TIME, submission.name))
                    warned = True

                log.debug("Found link in text post...")

                url = fix_url(anchor['href'])

                if skip_url(url):
                    continue

                if url in finishedURLs:
                    continue  # skip for sanity

                archives.append(ArchiveContainer(url, anchor.contents[0]))
                finishedURLs.append(url)
                ratelimit(url)

        Notification(submission, self._get_header(submission.subreddit),
                     archives).notify()
        db.commit()
def about():
    hot_posts = _get_hot()
    tags = _get_tags()
    post = Post.query.filter(Post.category_id == 0).first_or_404()
    post.content = unescape(post.content)
    return render_template('about.html', title='关于作者', hot=hot_posts,
                           tags=tags, post=post)
def get_content():
    content = ''
    page_list = browser.find_elements_by_xpath('//div[contains(@id,"pageNo")]')
    for page in page_list:
        content += (unescape(page.text) + '\n')
        print(page.text)
    content = content.splitlines()
    content = ''.join(content)
    # strip any remaining whitespace control characters
    content = re.sub(r'[\n\t\f\r]', '', content)
    return content
def render(self, email_msg, context=None):
    if self.hydrated_template is None:
        sections_text = ''.join(self.sections)
        self.hydrated_template = Template(
            HtmlEmailTemplate.base_template.render({"content": sections_text}))
    email_msg.content_subtype = "html"
    # For some reason xml markup characters in the template (<, >) get converted
    # to entity codes (&lt; and &gt;). We unescape to convert the markup characters back.
    _context = context or {}
    _context['use_signature'] = self.use_signature
    email_msg.body = unescape(self.hydrated_template.render(Context(_context)))
    return email_msg
def get_title(cls, data):
    parser = cls(convert_charrefs=True)
    try:
        parser.feed(data)
    except:  # many bugs lol
        return None
    title = parser.title
    if title is None:
        return None
    title = title.strip()
    if title:
        return re_space.sub(' ', unescape(title))
def get_mul(self):
    self._mul = self._selector.xpath(
        "//ul[@class ='polysemantList-wrapper cmn-clearfix']")
    if self._mul:
        self.other_type = [
            unescape(x.xpath('string(.)')).strip().replace("\xa0", "")
            for x in self._mul[0].xpath("//li[@ class='item']//a")]
        self._type = self._mul[0].xpath(
            "//li[@ class='item']//span")[0].text
    else:
        self._mul = []
        self._type = ""
def render_template(template_name, **context):
    """
    Render static resource using provided context.

    Returns: django.utils.safestring.SafeText
    """
    template_dirs = [os.path.join(os.path.dirname(__file__), 'static/html')]
    libraries = {'i18n': 'django.templatetags.i18n'}
    engine = Engine(dirs=template_dirs, debug=True, libraries=libraries)
    html = engine.get_template(template_name)
    return html_parser.unescape(html.render(Context(context)))
def decode_html(value):
    """
    Decode HTML entities.

    @param value: value to decode
    @type value: str
    """
    try:
        unicode_title = unicode(value, "utf8", "ignore")
        return unescape(unicode_title).encode("utf8")
    except:
        if PY3 and isinstance(value, bytes):
            value = value.decode("utf8")
        return value
def get_bycategory(cid, page_num):
    if not page_num or int(page_num) < 1:
        page_num = 1
    paginate = Post.query.order_by(Post.post_time.desc()).filter_by(stype=1).filter_by(category_id=cid). \
        filter_by(status=1).paginate(int(page_num), gl.index_page_limit, True)
    posts = paginate.items
    for p in posts:
        p.comment_counts = 0
        p.time = time.strftime('%Y-%m-%d', time.localtime(p.post_time))
        p.content = unescape(p.content)
    hot = _get_hot()
    tags = _get_tags()
    return render_template('type.html', title='分类', posts=posts, pagination=paginate,
                           cid=cid, hot=hot, tags=tags)
def run(self):
    """
    Checks through the submissions and archives and posts comments.
    """
    subreddit1 = reddit.subreddit(working_sub)
    for submission in subreddit1.stream.submissions():
        if submission.id not in posts_replied_to:
            posts_replied_to.append(submission.id)
            debugTime = time.time()
            warned = False
            with open("drama_posts_replied_to.txt", "a") as posts:
                posts.write("{}\n".format(submission.id))
                posts.close()
            log.info("Found submission.: {}".format(submission.permalink))

            archives = [ArchiveContainer(fix_url(submission.url), "*This Post*")]

            if submission.is_self and submission.selftext_html is not None:
                log.debug("Found text post...")

                links = BeautifulSoup(unescape(
                    submission.selftext_html)).find_all("a")

                if not len(links):
                    continue

                finishedURLs = []

                for anchor in links:
                    if time.time() > debugTime + WARN_TIME and not warned:
                        log.warn("Spent over {} seconds on post (ID: {})".format(
                            WARN_TIME, submission.name))
                        warned = True

                    log.debug("Found link in text post...")

                    url = fix_url(anchor['href'])

                    if skip_url(url):
                        continue

                    if url in finishedURLs:
                        continue  # skip for sanity

                    archives.append(ArchiveContainer(url, anchor.contents[0]))
                    finishedURLs.append(url)
                    ratelimit(url)

            time.sleep(50)
            Notification(submission, archives).notify()
            time.sleep(12)
def get_sourcecode(url):
    try:
        ssl._create_default_https_context = ssl._create_unverified_context
        r = urllib.request.Request(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
            })
        r_source = urllib.request.urlopen(r)
        r_source_read = str(r_source.read())
        return unescape(r_source_read)
    except:
        return None
def _get_quotes(wiki_page):
    # Remove remaining escape characters from wiki content
    quotes = unescape(wiki_page.content_md)
    # Remove comment lines starting with # or ; including any leading whitespace
    quotes = re.sub('^[ \t]*[#;].*$', '', quotes, flags=re.MULTILINE)
    # Split and strip the quotes into an array using --- as a delimiter
    quotes = [quote.strip() for quote in quotes.split('---')]
    # Remove any blank quotes
    quotes = [quote for quote in quotes if quote]
    return quotes
def index():
    page_num = request.args.get('page_num')
    if not page_num or int(page_num) < 1:
        page_num = 1
    paginate = Post.query.filter_by(status=1).filter_by(stype=1).filter(Post.category_id > 0). \
        order_by(Post.post_time.desc()).paginate(int(page_num), gl.index_page_limit, True)
    posts = paginate.items
    for p in posts:
        p.comment_counts = 0
        p.time = str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(p.post_time))[0:-9])
        p.content = unescape(p.content)
    hot_posts = _get_hot()
    tags = _get_tags()
    return render_template('home.html', title='首页', posts=posts, pagination=paginate,
                           hot=hot_posts, tags=tags)
def rankings():
    last_updated = update_check()
    position = request.form["action"]
    print(position)
    player = [{'rank': .5}]
    player += get_all_players(position)
    player.append({'rank': len(player)})
    test = '['
    for item in player:
        test += (json.dumps(item)) + ', '
    test += ']'
    print(test)
    test = parser.unescape(test)
    return render_template('rankings.html', updated=last_updated,
                           type=position, data=test)
def getNews(symbol):
    url = buildNewsUrl(symbol)
    content = urlopen(url).read().decode('utf-8')
    content_json = demjson.decode(content)

    article_json = []
    news_json = content_json['clusters']
    for cluster in news_json:
        for article in cluster:
            if article == 'a':
                article_json.extend(cluster[article])

    return [[unescape(art['t']).strip(), art['u']] for art in article_json]
def format_transcript_text(self, text):
    """
    Prepare unescaped transcripts to be converted to WebVTT format.
    """
    new_text = [
        self.format_transcript_text_line(line)
        for line in text[0].splitlines()
    ]
    new_text = '\n'.join(new_text)
    unescaped_text = html_parser.unescape(new_text)
    if "WEBVTT" not in text:
        text = "WEBVTT\n\n" + unescaped_text
    else:
        text = unescaped_text
    return text
def test_reset_proceed_wrong_confirm(user, db_session, default_app):
    """Reset test for resetting a password with non-matching passwords."""
    user = db_session.merge(user)
    user.set_reset()
    transaction.commit()
    user = db_session.merge(user)

    res = default_app.get("/password/reset/" + user.reset_key)
    res.form["password"] = NEW_PASSWORD
    res.form["confirm_password"] = NEW_PASSWORD + "Typo"
    res = res.form.submit()
    assert "Error! Password doesn't match" in unescape(
        res.body.decode("unicode_escape"))
def test_register_error(db_session, default_app, email, password,
                        confirm_password, error):
    """Error in registration process."""
    assert db_session.query(User).count() == 0

    res = default_app.get("/register")
    if email is not None:
        res.form["email"] = email
    res.form["password"] = password
    res.form["confirm_password"] = confirm_password
    res = res.form.submit(extra_environ={"REMOTE_ADDR": "0.0.0.0"})
    transaction.commit()

    assert error in unescape(res.body.decode("unicode_escape"))
    assert db_session.query(User).count() == 0
def get_detail(pid):
    post = Post.query.filter_by(id=pid).filter_by(status=1).filter_by(stype=1).filter(Post.category_id > 0). \
        first_or_404()
    post.view_counts += 1
    db.session.flush()
    pre_post = Post.query.order_by(Post.id.desc()).filter_by(status=1).filter_by(stype=1).filter(Post.id < pid).first()
    next_post = Post.query.order_by(Post.id.asc()).filter(Post.id > pid).filter_by(status=1).filter_by(stype=1).first()
    post.time = str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(post.post_time)))
    post.comment_counts = 0
    post.content = unescape(post.content)
    hot = _get_hot()
    tags = _get_tags()
    _store_visitors(pid)
    return render_template('detail.html', title=post.title, p=post, pre_post=pre_post,
                           next_post=next_post, hot=hot, tags=tags)
def _post_snapshots(self, post):
    link_list = ""
    this_post = ""
    logging.debug("Fetching archive link for submission {0}: {1}".format(
        post.id, "http://redd.it/" + post.id))
    try:
        if post.is_self and post.selftext_html is not None:
            soup = BeautifulSoup(unescape(post.selftext_html))
            for anchor in soup.find_all('a'):
                url = anchor['href']
                netloc = urllib.parse.urlparse(url)[1]
                if netloc == '':
                    netloc = 'reddit.com'
                    url = "http://www.reddit.com" + urllib.parse.urlparse(url)[2]
                if netloc in self.config['domains'] or 'all' in self.config['domains']:
                    archive_link = self._get_archive_url(self._fix_reddit_url(url))
                    link_list += "* [{0}...]({1})\n\n".format(
                        anchor.contents[0][0:randint(35, 40)], archive_link)
        elif not post.is_self:
            archive_link = self._get_archive_url(self._fix_reddit_url(post.url))
            link_list = "* [Link]({0})\n".format(archive_link)
        this_post = self._get_archive_url("http://redd.it/" + post.id)
    except KeyboardInterrupt as e:
        logging.error("Error fetching archive link on submission {0}: {1}".format(
            post.id, "http://redd.it/" + post.id))
        logging.error(str(e))
        pass
    quote = self._get_quote()
    try:
        if not post.archived:
            logging.info("Posting snapshot on submission {0}: {1}".format(
                post.id, "http://redd.it/" + post.id))
            post.add_comment(self.post_comment.format(
                quote=quote, this_post=this_post, links=link_list,
                subreddit=self.config['bot_subreddit']))
            self.post_archive.add(post.id)
    except Exception as e:
        logging.error("Error adding comment on submission {0}: {1}".format(
            post.id, "http://redd.it/" + post.id))
        logging.error(str(e))
def process(self):
    report = self.receive_message()
    raw_report = utils.base64_decode(report.get("raw"))
    raw_report_splitted = raw_report.split("</tr>")[2:]
    parser = html.parser.HTMLParser()

    for row in raw_report_splitted:
        event = Event(report)

        row = row.strip()
        if len(row) <= 0:
            continue

        info = row.split("<td>")
        if len(info) < 3:
            continue

        ip = info[1].split('</td>')[0].strip()
        last_seen = info[2].split('</td>')[0].strip() + '-05:00'
        description = parser.unescape(info[3].split('</td>')[0].strip())

        for key in ClassificationType.allowed_values:
            if description.lower().find(key.lower()) > -1:
                event.add("classification.type", key)
                break
        else:
            for key, value in TAXONOMY.items():
                if description.lower().find(key.lower()) > -1:
                    event.add("classification.type", value)
                    break

        if not event.contains("classification.type"):
            event.add("classification.type", 'unknown')

        event.add("time.source", last_seen)
        event.add("source.ip", ip)
        event.add("event_description.text", description)
        event.add("raw", row)

        self.send_message(event)
    self.acknowledge_message()
def __init__(self, url, numCols, extractionMap, exceptions):
    # Request the html.
    request = urllib.request.Request(url)
    request.add_header("User-Agent", self.user_agent)
    try:
        response = urllib.request.urlopen(request)
    except:
        print("Error: Invalid URL. Exiting.")
        exit()
    htmlContent = response.read().decode("utf8")

    # Some files have <br> in the middle of a <td> tag,
    # and cause the parser to misinterpret the data.
    htmlContent = htmlContent.replace("<br>", "")

    # Parse the html.
    parser = CountryParser(numCols, extractionMap, exceptions, strict=False)
    htmlContent = parser.unescape(htmlContent)  # Unescape HTML entities.
    parser.feed(htmlContent)
    parser.close()
    self.__countryData = parser.countryData
def get_skills(cls):
    r = s.get('http://bddatabase.net/query.php?a=skills&type=%s&l=us' % cls)
    data = []
    for entry in r.json()['aaData']:
        id = int(entry[0])
        original_name = bold_pattern.search(entry[2]).group(1)
        level = int(entry[3])
        name = parser.unescape(original_name).strip()
        name = name \
            .replace(': ', '_') \
            .replace(' - ', "_") \
            .replace("'", "") \
            .replace(' ', "_") \
            .upper()
        name = re.sub(r'[^a-zA-Z\d\s_]', '', name)
        skill_number_match = skill_number_pattern.search(name)
        if skill_number_match:
            skill_number = skill_number_match.group()[1:]
            try:
                skill_number = int(skill_number)
            except:
                try:
                    skill_number = roman.fromRoman(skill_number)
                except:
                    print(repr(original_name))
                    raise
            name_without_skill_number = name[:-(skill_number_match.end() - skill_number_match.start())]
        else:
            skill_number = None
            name_without_skill_number = name
        print('%s id %i skillnumber %s' % (
            name_without_skill_number.encode('cp850', errors='replace'), id, skill_number))
        data.append([name, name_without_skill_number, id, skill_number])
    return data
def parse_playlist(playlist):
    data = json.loads(playlist)
    if not data['status']:
        return []

    # trackList would be `null` if no tracks
    track_list = data['data']['trackList']
    if not track_list:
        return []

    parser = html.parser.HTMLParser()
    return [
        {
            key: parser.unescape(track[key])
            for key in [
                'title', 'location', 'lyric', 'pic',
                'artist', 'album_name', 'song_id', 'album_id'
            ]
        }
        for track in track_list
    ]
def parse_output(self, line):
    # Capture SKOOTs
    if line.find('SKOOT') != -1:
        self.parse_skoot(line)
    else:
        parser = html.parser.HTMLParser()
        line = parser.unescape(line)

        # Before we nuke the HTML closing tags, decide if we need to un-nest some lists.
        if self.list_depth > 0:
            self.list_depth -= line.count('</ul>')
            # pprint('List depth now lowered to: ' + str(self.list_depth))

        line = re.sub(r"</.*?>", "", line)
        tags = []
        self.draw_output("\n")

        # line is now a string with HTML opening tags.
        # Each tag should delineate a segment of the string so that if removed
        # the resulting string would be the output line.
        # It can be a subset of (antiquated) HTML tags:
        # center, font, hr, ul, li, pre, b
        pattern = re.compile(r'<(.*?)>')
        segments = pattern.split(line)
        if segments.__len__() > 1:
            for segment in segments:
                segment = segment.strip('<>')
                # Not sure if more Pythonic to do this or a dictionary of functions
                if re.search(r'thinks aloud:', segment):
                    # Just a thought, print it!
                    self.draw_output('<' + segment + '>', tuple(tags))
                elif re.match(r'font', segment):
                    # Handle font changes
                    # So far I know of size and color attributes.
                    color = re.match(r'font color="(#[0-9a-fA-F]{6})"', segment)
                    if color:
                        color = color.group(1)
                        self.output_panel.tag_configure(
                            color, foreground=color,
                            font=self.output_panel.cget("font"))
                        tags.append(color)
                    # @todo Handle sizes
                elif re.match(r'hr', segment):
                    i = 0
                    line = ''
                    while i < self.line_length:
                        line += '-'
                        i += 1
                    self.draw_output(line, 'center')
                elif re.match(r'pre', segment):
                    # For now, we're just handling this as centered because our font is already fixed width.
                    tags.append('center')
                elif re.match(r'center', segment):
                    tags.append('center')
                elif re.match(r'b', segment):
                    tags.append('bold')
                elif re.match(r'ul', segment):
                    self.list_depth += 1
                    # pprint('List depth now raised to: ' + str(self.list_depth))
                    segment.replace('ul', '')
                    if re.match(r'li', segment):
                        segment = segment.replace('li', self.draw_tabs() + "* ")
                        self.draw_output(segment, tuple(tags))
                elif re.match(r'li', segment):
                    segment = segment.replace('li', self.draw_tabs() + "* ")
                    self.draw_output(segment, tuple(tags))
                else:
                    # Not a special segment
                    self.draw_output(segment, tuple(tags))
        else:
            self.draw_output(line, None)
s.auth = HttpNtlmAuth('INTRA\\ruijie.yang', pw, s)
ret01 = s.get(url06, params=payload01)
if '<title>Working...' in ret01.text:
    break
else:
    print('Authentication failed, please try again')
# ret01 = s.get(url02)
# payload01['wctx'] = ret01.text.split('wctx=')[1].split('\\u0026')[0]
# ret01 = s.get(url01)
# print(ret01.text)
# if '[200]' in ret01.text:
#     break
# else:
#     print('Authentication failed, please try again')
# payload02['wctx'] = payload01['wctx']

payload02['wresult'] = parser.unescape(ret01.text.split('name="wresult" value="')[1].split('" />')[0])
payload02['wctx'] = parser.unescape(ret01.text.split('name="wctx" value="')[1].split('" />')[0])
ret02 = s.post(url02, data=payload02)
# print(ret02.text)
payload03['t'] = ret02.text.split('value="')[1].split('">')[0]
url03 = ret02.text.split('action="')[1].split('" ')[0]
ret03 = s.post(url03, data=payload03)
payload04['t'] = ret03.text.split('value="')[1].split('">')[0]
ret04 = s.post(url04, data=payload04)
ret9 = s.get(url9)
payload05['t'] = ret9.text.split('value="')[1].split('">')[0]
ret9 = s.post(url05, data=payload05)
lines = ret9.text.replace('\r\n', '\n').split('\n')
parseDish(s, 9, lines, dishes, id2pic)
def format_text(text):
    return parser.unescape(text).strip()
def main():
    """main function"""
    if len(sys.argv) < 2:
        print("Usage: auto script")
        exit(1)
    payload01 = {
        "username": "******",
        "wa": "wsignin1.0",
        "wtrealm": "urn:federation:MicrosoftOnline",
        "popupui": "",
    }
    payload02 = {"wa": "wsignin1.0"}
    payload03 = {"wa": "wsignin1.0"}
    payload04 = {}
    payload05 = {}
    pw = getpass.getpass()
    dishes = []
    id2pic = {}
    parser = html.parser.HTMLParser()
    with requests.session() as sess:
        sess.auth = HttpNtlmAuth("INTRA\\ruijie.yang", pw, sess)
        sess.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "en-US,en;q=0.8,ja;q=0.6,zh-CN;q=0.4,zh;q=0.2",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Content-Type": "application/x-www-form-urlencoded",
                "Upgrade-Insecure-Requests": "1",
            }
        )
        # ret01 = sess.get(URL02)
        # payload01['wctx'] = ret01.text.split('wctx=')[1].split('\\u0026')[0]
        ret01 = sess.get(URL01)
        print(ret01.text)
        payload01[
            "wctx"
        ] = "estsredirect=2&estsrequest=rQIIAbPSySgpKSi20tcvyC8qSczRy81MLsovzk8ryc_LycxL1UvOz9XLL0rPTAGxioS4BBYVHnL4-T7Qcc-kppq1-SXbVjEqEzZC_wIj4wtGxltMgv5F6Z4p4cVuqSmpRYklmfl5j5h4Q4tTi_zzcipD8rNT8yYx8-Xkp2fmxRcXpcWn5eSXAwWAJhQkJpfEl2QmZ6eW7GJWSTYySDEwSkvSTTQ2T9M1MTU30LUwMjXXNUlLNrBINkhMTTZJu8AicICTEQA1"
        # payload02['wctx'] = payload01['wctx']
        ret01 = sess.get(URL06, params=payload01)
        payload02["wresult"] = parser.unescape(
            ret01.text.split('name="wresult" value="')[1].split('" />')[0])
        payload02["wctx"] = parser.unescape(
            ret01.text.split('name="wctx" value="')[1].split('" />')[0])
        ret02 = sess.post(URL02, data=payload02)
        # print(ret02.text)
        payload03["t"] = ret02.text.split('value="')[1].split('">')[0]
        ret03 = sess.post(URL03, data=payload03)
        payload04["t"] = ret03.text.split('value="')[1].split('">')[0]
        ret04 = sess.post(URL04, data=payload04)
        ret9 = sess.get(URL9)
        payload05["t"] = ret9.text.split('value="')[1].split('">')[0]
        ret9 = sess.post(URL05, data=payload05)
        lines = ret9.text.replace("\r\n", "\n").split("\n")
        parse_dish(sess, 9, lines, dishes, id2pic)
        ret22 = sess.get(URL22)
        lines = ret22.text.replace("\r\n", "\n").split("\n")
        parse_dish(sess, 22, lines, dishes, id2pic)
        download_pics(sess, dishes, id2pic)
        # for x in INGD_LST:
        #     ingdfile = sess.get(URL_INGD % x, stream=True)
        #     with open('static/images/%s.jpg' % x, 'wb') as out_file:
        #         shutil.copyfileobj(ingdfile.raw, out_file)
        #     del ingdfile
        dish_json = json.dumps([vars(x) for x in dishes])
        # print(dish_json)
        # print(id2pic)
        ret = requests.post(URL_UPDATE, data=dish_json)
        print(ret.text)
def unescape(text):
    if sys.version_info[0] < 3:
        parser = HTMLParser.HTMLParser()
    else:
        parser = html.parser.HTMLParser()
    return parser.unescape(text)
def unescape(text):
    parser = html.parser.HTMLParser()
    return parser.unescape(text)
    homoplasy += sum(C.values())
    print(idx+1, pr[1], homoplasy, sum(C.values()))

print('TOTAL', '{0:.2}'.format(homoplasy / len(data['protos'])))

with open('R_sound-change-frequencies-'+matrix+'.tsv', 'w') as f:
    f.write('SOURCE\tTARGET\tFREQUENCY\n')
    for (s,t),v in sorted(H.items(), key=lambda x: x[1], reverse=True):
        f.write('{0}\t{1}\t{2}\n'.format(s,t,v))

G = nx.DiGraph()
for a,b in H:
    G.add_edge(a,b,weight=H[a,b])
nx.write_gml(G,'.tmp.gml')
tmp = open('.tmp.gml').read()
with open('R_scf-'+matrix+'.gml', 'w') as f:
    f.write(parser.unescape(tmp))

if 'proto' in argv:
    if not tree:
        raise ValueError("No tree specified!")
    C = {}
    for idx,(p,m,c,pr) in enumerate(zip(
            data['patterns'], data[matrix],
            data['chars'] if matrix != 'fitch' else data['fitch.chars'],
            data['protos'])):
        w,p,r = sankoff_parsimony(
            p, data['taxa'], tree,
# check statistics, for example, get
C = G.community_infomap(
    edge_weights='woccurrence',
    vertex_weights='occurrence'
)
for community, name in zip(C.membership, G.vs['name']):
    _G.node[name]['infomap'] = community
print('[i] Calculated communities for rhyme words.')

from html import parser

nx.write_gml(N, 'R_rime_transitions.gml')
with open('R_rime_transitions.gml') as f:
    _t = f.read()
with open('R_rime_transitions.gml', 'w') as f:
    f.write(parser.unescape(_t))

nx.write_gml(_G, 'R_infomap.gml')
with open('R_infomap.gml') as f:
    _t = f.read()
with open('R_infomap.gml', 'w') as f:
    f.write(parser.unescape(_t))

nx.write_yaml(_G, 'R_infomap.yaml')
def unescape(message):
    parser = html.parser.HTMLParser()
    return parser.unescape(message)
        self.lyric_data = self.getpos()
        self.start_lyric_set = False
        self.lyric_data_set = True

    def handle_endtag(self, tag):
        if self.lyric_data_set:
            self.lyric_data_set = False
            self.end_lyric = self.getpos()


if __name__ == "__main__":
    page = urllib.request.urlopen("http://lyrics.wikia.com/Radiohead:Separator")
    parser = LyricParser()
    giant_html_string = ""
    for line in page:
        giant_html_string += parser.unescape((fix(line)))
    parser.feed(giant_html_string)
    print(parser.start_lyric)
    print(parser.lyric_data)
    print(parser.end_lyric)
    # print(giant_html_string[parser.start_lyric[0]:parser.lyric_data[0]])
    i = 0
    for line in giant_html_string.split("\n"):
        if i >= parser.start_lyric[0] and i <= parser.lyric_data[0]:
            print(line)
        i += 1
    print(i)
    # page.seek(parser.start_lyric[0])