def gen_slogan(msg):
    html = get('http://www.sloganizer.net/en/outbound.php', params={'slogan': msg})
    slogan = re.search('>(.*)<', html.text).group(1)
    parser = HTMLParser()
    slogan = parser.unescape(parser.unescape(slogan))
    slogan = slogan.replace('\\', '').strip()
    return slogan if slogan else gen_slogan(msg)

def parse_videos_from_feed():
    """ Ingest MRSS feed into local scope; format videos to FB upload spec """
    data = feedparser.parse(os.getenv('MTFV_MRSS_URL'))
    h = HTMLParser()
    videos = []
    for video in data.entries:
        if get_value(video['guid']):
            continue
        formatted_video = {
            'title': h.unescape(video['title']),
            'description': h.unescape(video['summary']),
            'guid': video['guid'],
            'file_url': video['media_content'][0]['url'],
            'file_size': video['media_content'][0]['filesize'],
            'thumb_url': video['media_thumbnail'][0]['url']
        }
        if os.getenv('MRSS_SCHEDULED_DATETIME_ELEMENT') and video[os.getenv('MRSS_SCHEDULED_DATETIME_ELEMENT')]:
            formatted_video['published'] = 0
            formatted_video['unpublished_content_type'] = 'SCHEDULED'
            datetime = parse(video[os.getenv('MRSS_SCHEDULED_DATETIME_ELEMENT')])
            formatted_video['scheduled_publish_time'] = datetime.strftime('%s')
        videos.append(formatted_video)
    return videos

def parseHTML(self, string, timezoneoffset):
    h = HTMLParser()
    anime_showtimes = {}
    start = False
    for line in string:
        try:
            if not line.startswith('<h2 class="weekday">') and not start:
                pass
            elif re.search(r'<td class="schedule-page-show">.*>(.*)</a>', line) is not None:
                title = h.unescape(re.search(r'<td class="schedule-page-show">.*>(.*)</a>', line).group(1))
            elif re.search(r'<td class="schedule-show">(.*)</td>', line) is not None:
                title = h.unescape(re.search(r'<td class="schedule-show">(.*)</td>', line).group(1))
            elif re.search(r'<h2 class="weekday">(.*)</h2>', line) is not None:
                weekday = h.unescape(re.search(r'<h2 class="weekday">(.*)</h2>', line).group(1))
                if weekday == 'To be scheduled':
                    break
                elif not start:
                    start = True
                for i, a in enumerate(['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']):
                    if weekday == a:
                        weekday = i
                        break
            elif re.search(r'<td class="schedule-time">(\d\d:\d\d)</td>', line) is not None:
                time = h.unescape(re.search(r'<td class="schedule-time">(\d\d:\d\d)</td>', line).group(1))
                time_object = datetime.datetime(2016, 5, 8 + weekday, int(time[0:2]), int(time[3:5])) + datetime.timedelta(hours=int(timezoneoffset))
                anime_showtimes[title.lower()] = [time_object.isoweekday(), time_object.hour, time_object.minute, title]
        except Exception as e:
            print(str(e))
    return anime_showtimes

def handle_task(self, job):
    user = job.get("user", "root")
    group = job.get("group", "root")
    mail = job.get("sender", None)
    account = Account(user=user, group=group, mail=mail)
    recipients = job.get("recipients", None)
    subject = ensure_unicode(job.get("subject", ""))
    body = ensure_unicode(job.get("body", ""))
    attachments = job.get("attachments", None)
    smtp_host = job.get("smtp_host", "localhost")
    smtp_port = job.get("smtp_port", 25)
    html = job.get("html", False)
    template_data = job.get("jobctx", {})
    body = Template(body)(template_data)
    subject = Template(subject)(template_data)
    if not html:
        h = HTMLParser()
        body = h.unescape(body)
        subject = h.unescape(subject)
    # Execute the task
    return self.sendmail(account, recipients, subject, body, attachments, smtp_host, smtp_port, html)

def handle_task(self, job):
    user = job.get('user', 'root')
    group = job.get('group', 'root')
    mail = job.get('sender', None)
    account = Account(user=user, group=group, mail=mail)
    recipients = job.get('recipients', None)
    subject = ensure_unicode(job.get('subject', ''))
    body = ensure_unicode(job.get('body', ''))
    attachments = job.get('attachments', None)
    smtp_host = job.get('smtp_host', 'localhost')
    smtp_port = job.get('smtp_port', 25)
    html = job.get('html', False)
    template_data = job.get('jobctx', {})
    body = Template(body)(template_data)
    subject = Template(subject)(template_data)
    if not html:
        h = HTMLParser()
        body = h.unescape(body)
        subject = h.unescape(subject)
    # Execute the task
    return self.sendmail(
        account, recipients, subject, body, attachments,
        smtp_host, smtp_port, html)

def _get_track_name(self, t_data):
    html_parser = HTMLParser()
    full_name = "{0} - {1}".format(
        html_parser.unescape(t_data['artist'])[:50].strip(),
        html_parser.unescape(t_data['title'])[:50].strip(),
    )
    full_name = re.sub('[' + FORBIDDEN_CHARS + ']', "", full_name)
    full_name = re.sub(' +', ' ', full_name)
    return full_name

def get_quiz():
    h = HTMLParser()
    random.seed()
    quiz_type = random.choice([0, 1])
    quiz_up = from_group(1) + from_group(2) + from_group(3)
    random.shuffle(quiz_up)
    parent_body = lambda c: h.unescape(c.parent.body_html) if c.parent is not None else c.submission.title
    p_b = lambda c: parent_body(c) if quiz_type == 1 else ''
    t_d = lambda c: str(c.created_utc - c.submission.created_utc) if quiz_type == 1 else ''
    quiz = {'quiz': [{'body': h.unescape(c.body_html),
                      'comment_id': c.c_id,
                      'parent_body': p_b(c),
                      'time_diff': t_d(c)} for c in quiz_up],
            'type': quiz_type}
    return json.dumps(quiz)

def get_track_full_name(data):
    """
    Get the track's full name and convert it to a string
    like 'artist_track.mp3'
    """
    forbidden_symbols = ',!.;/'
    html_parser = HTMLParser()
    full_name = u"{0}_{1}".format(
        html_parser.unescape(data['artist'][:100]).strip(),
        html_parser.unescape(data['title'][:100]).strip(),
    )
    full_name = ''.join(c for c in full_name if c not in forbidden_symbols)
    return full_name + ".mp3"

def editf():
    fn = request.args.get('filename')
    if not os.path.exists('mdfile/' + fn):
        editer = {"filename": fn, "content": "# New_File.md"}
        h = HTMLParser()
        return h.unescape(render_template("tmpl/model/edit.html", editer=editer))
    else:
        f = open('mdfile/' + fn)
        tx = f.read()
        f.close()
        editer = {"filename": fn, "content": tx}
        h = HTMLParser()
        return h.unescape(render_template("tmpl/model/edit.html", editer=editer))

def gowiki():
    nm = request.args.get('file')
    if not os.path.exists('mdfile/' + nm):
        wiki = {'title': "Page Not Found", 'content': "# Page Not Found"}
        h = HTMLParser()
        return h.unescape(render_template("tmpl/model/model.html", wiki=wiki))
    else:
        f = open('mdfile/' + nm)
        ct = f.read()
        f.close()
        wiki = {'title': nm, 'content': ct}
        h = HTMLParser()
        return h.unescape(render_template("tmpl/model/model.html", wiki=wiki))

def _parse_article(self, div):
    self.article = Article()
    parser = HTMLParser()
    for tag in div:
        if not hasattr(tag, 'name'):
            continue
        if tag.name == 'div' and self._tag_has_class(tag, 'gs_ri'):
            rt = tag.find('h3', {'class': 'gs_rt'})
            if rt:
                ctu = rt.find('span')
                if ctu:
                    ctu.extract()
                self.article['title'] = parser.unescape(''.join(rt.findAll(text=True)).strip())
                if rt.a:
                    self.article['url'] = self._path2url(rt.a['href'])
            if tag.find('div', {'class': 'gs_a'}):
                year = self.year_re.findall(tag.find('div', {'class': 'gs_a'}).text)
                self.article['year'] = year[0] if len(year) > 0 else None
            if tag.find('div', {'class': 'gs_fl'}):
                self._parse_links(tag.find('div', {'class': 'gs_fl'}))
            if tag.find('div', {'class': 'gs_rs'}):
                self.article['summary'] = tag.find('div', {'class': 'gs_rs'}).text
    if self.article['title']:
        self.handle_article(self.article)

def get_game_list(system):
    """List all the games on Guardiana for a given system."""
    response = urllib.request.urlopen("http://www.guardiana.net/MDG-Database/Complete-List/" + system + "/")
    doc = response.read()
    soup = BeautifulSoup(doc)
    html_game_list = soup.find("div", {"id": "MDGD_FullList_Box"})
    game_list = re.findall("""» <a href="(.+?)">(.+?)</a><br/>(?:<em>)?(.*?)(?:</em>)?<br/>""", str(html_game_list))
    game_dict_list = []
    for game in game_list:
        game_dict = {'url': "http://www.guardiana.net" + game[0], 'title': []}
        # Clean up the URL and add it
        result = re.search("(.*?)\?PHPSESSID=.*?", game[0])
        if result:
            game_dict['url'] = "http://www.guardiana.net" + result.group(1)
        else:
            game_dict['url'] = "http://www.guardiana.net" + game[0]
        # Unescape the HTML entities from titles and add them
        pars = HTMLParser()
        game_dict['title'].append(pars.unescape(game[1]))
        game_dict_list.append(game_dict)
    return game_dict_list

def linksh(self, cli, ev):
    try:
        self.chancache[ev.target.lower()]
    except:
        return 1
    if self.yt is True:
        yr = re.compile(".*(youtube\.com\/watch\?.*v=|youtu\.be\/)([A-Za-z"
                        "0-9._%-]*)[&\w;=\+_\-]*.*")
        res = yr.search(ev.arguments[0])
        if res is not None:
            self.ytlinks(cli, ev, res)
            return 0
    url = re.compile("((https?):((\/\/)|(\\\\))+[\w\d:#@%\/;$()~_?\+-=\\\."
                     "&]*)")
    res = url.search(ev.arguments[0])
    if res is None:
        return 1
    uri = res.group(1)
    r = urllib.request.urlopen(uri).read().decode('utf-8', 'replace')
    parser = HTMLParser()
    r = parser.unescape(r)
    yr = re.compile(".*<title[^>]*>([^<]+)</title>.*")
    title = yr.search(r)
    if title is None:
        return 1
    cli.msg(ev.target, title.group(1))

def _get_springer_journal_stats(journal_id, period, oa=False):
    if not journal_id.isdigit():
        raise ValueError("Invalid journal id " + journal_id + " (not a number)")
    url = SPRINGER_FULL_SEARCH.format(journal_id, period, period)
    if oa:
        url = SPRINGER_OA_SEARCH.format(journal_id, period, period)
    print(url)
    try:
        req = Request(url, None)
        response = urlopen(req)
        content = response.read()
        content = content.decode("utf-8")
        results = {}
    except HTTPError as httpe:
        if httpe.code == 503:
            # retry on timeout
            print(colorise("Timeout (HTTP 503), retrying...", "yellow"))
            return _get_springer_journal_stats(journal_id, period, oa)
        else:
            raise httpe
    count_match = SEARCH_RESULTS_COUNT_RE.search(content)
    if count_match:
        count = count_match.groupdict()['count']
        count = count.replace(",", "")
        results['count'] = int(count)
    else:
        raise ValueError("Regex could not detect a results count at " + url)
    title_match = SEARCH_RESULTS_TITLE_RE.search(content)
    if title_match:
        title = title_match.groupdict()['title']
        htmlparser = HTMLParser()
        results['title'] = htmlparser.unescape(title)
    else:
        raise ValueError("Regex could not detect a journal title at " + url)
    return results

def publishPost(self, post, link, comment):
    logging.info(" Publishing in Telegram...")
    bot = self.tc
    title = post
    content = comment
    links = ""
    channel = self.channel
    from html.parser import HTMLParser
    h = HTMLParser()
    title = h.unescape(title)
    text = '<a href="' + link + '">' + title + "</a>\n" + content + '\n\n' + links
    textToPublish2 = ""
    if len(text) < 4090:
        textToPublish = text
        links = ""
    else:
        text = '<a href="' + link + '">' + title + "</a>\n" + content
        textToPublish = text[:4080] + ' ...'
        textToPublish2 = '... ' + text[4081:]
    logging.info("text to " + textToPublish)
    logging.info("text to 2" + textToPublish2)
    bot.sendMessage('@' + channel, textToPublish, parse_mode='HTML')
    if textToPublish2:
        try:
            bot.sendMessage('@' + channel, textToPublish2[:4090], parse_mode='HTML')
        except:
            bot.sendMessage('@' + channel, "Text is longer", parse_mode='HTML')
    if links:
        bot.sendMessage('@' + channel, links, parse_mode='HTML')

def forwards(apps, schema_editor):
    html_parser = HTMLParser()
    for cascade_element in CascadeElement.objects.all():
        if cascade_element.plugin_type != 'CarouselSlidePlugin':
            continue
        caption = cascade_element.glossary.get('caption')
        if not caption:
            continue
        text_element = add_plugin(cascade_element.placeholder, TextPlugin, cascade_element.language, target=cascade_element)
        old_body = html_parser.unescape(caption)
        new_body, count = _replace_text_body(
            old_body,
            input_pattern=r'<img ([^>]*)\bid="plugin_obj_(?P<pk>\d+)"([^>]*)/?>',
            output_tag='<cms-plugin {}></cms-plugin>',
            id_format='id="{}"',
        )
        text_element.body = new_body
        text_element.save()
        # TODO: needs to be re-tested
        if False and count > 0:
            for link_element in CMSPlugin.objects.filter(parent_id__in=(cascade_element.id, cascade_element.parent_id), plugin_type='TextLinkPlugin'):
                # print("Move Link {} from {} -> {}".format(link_element.id, link_element.parent_id, text_element.id))
                link_element.move(text_element, pos='last-child')
                link_element.save()

def get_images(current_title, title, titles_length):
    h = HTMLParser()
    print("Fetching images from %s... (%s/%s)" % (title, current_title + 1, titles_length))
    # Escape the title so we can create a valid link
    # title = title.replace('\'', '%27').replace(' ', '%20')
    # Repetition is success
    while True:
        try:
            page = urlopen(SOURCE_LOCATION % title).read().decode(ENCODING)
            break
        except IOError:
            print("\tServer's being lazy, retrying...")
    if not page:
        print("\tFailed to get %s's images!" % title)
        return []
    # Ignore redirects
    if search("#DOORVERWIJZING", page, I | M) is not None or search("#REDIRECT.*", page, I | M) is not None:
        print("\tSkipping redirecting page %s" % title)
        return []
    imagelinks = []
    parser = ImageLocater(imagelinks)
    page = h.unescape(page)
    try:
        parser.feed(page)
    except:
        print("%s is a malformatted page" % title)
        return []
    return imagelinks

def extract(self):
    ms = re.findall(r"<title>(.*)\s-\sYouTube</title>", self.data)
    p = HTMLParser()
    if ms:
        return p.unescape(ms[0])
    else:
        return ""

def group_search(keywords, cookie):
    '''search VK groups by the given keywords, lol'''
    from html.parser import HTMLParser
    parser = HTMLParser()
    s = ''
    for word in keywords:
        s += word + ' '
    site = 'http://vk.com/al_groups.php'  # group search endpoint
    # site = 'http://vk.com/al_video.php'  # that one holds the hash for videos
    post = {'act': 'server_search', 'al': '1', 'q': s}  # the magic POST payload
    # post = {'act': 'show', 'al': '1', 'module': 'vieo', 'video': '100457938_162516488'}
    data = req.post(site, post)
    html = parser.unescape(data.text)
    # print(html)
    # sys.exit
    html_pre = html.strip().splitlines()
    groups = []
    line = 'd'
    group_stat = collections.namedtuple('group_stat', ['path', 'name', 'num'])
    nstr = 0
    for line in html_pre:
        line = line.lstrip()
        if line.lstrip().startswith('<div class="group_row_labeled"><a href='):
            # a bit more magic
            temp1 = re.search(r'(?<=<div class="group_row_labeled"><a href=")/\w+', line).group()
            temp2 = re.sub(r'<.+?>', '', line)
            nstr = 1
        elif nstr == 1:
            nstr = 2
        elif nstr == 2:
            groups.append(group_stat(temp1, temp2, re.search(r'\d+', line).group()))
            nstr = 0
    return groups

def __parseResultsArea1(resultsArea):
    """
    Parses <div id="resultsArea">...</div> from the Bing! history page.
    Returns a list of queries (can be an empty list).
    """
    startMarker = '<span class="query_t">'
    startMarkerLen = len(startMarker)
    history = []
    htmlParser = HTMLParser()
    s = 0
    while True:
        s = resultsArea.find(startMarker, s)
        if s == -1:
            break
        # locate a query
        s += startMarkerLen
        s = resultsArea.index("<a ", s)
        s += 3
        s = resultsArea.index(">", s)
        s += 1
        e = resultsArea.index("</a>", s)
        # resultsArea[s:e] now contains a query from history
        history.append(htmlParser.unescape(resultsArea[s:e]).strip())
        s = e + 4
    return history

def __parseResultsArea2(resultsArea):
    """
    Parses results from the Bing! history page.
    Returns a list of queries (can be an empty list).
    """
    startMarker = '<span class="sh_item_qu_query">'
    startMarkerLen = len(startMarker)
    history = []
    htmlParser = HTMLParser()
    s = 0
    while True:
        s = resultsArea.find(startMarker, s)
        if s == -1:
            break
        # locate a query
        s += startMarkerLen
        e = resultsArea.index("</span>", s)
        # resultsArea[s:e] now contains a query from history
        history.append(htmlParser.unescape(resultsArea[s:e]).strip())
        s = e + 7
    return history

def check_gplay(app):
    time.sleep(15)
    url = 'https://play.google.com/store/apps/details?id=' + app.id
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:18.0) Gecko/20100101 Firefox/18.0'}
    req = urllib.request.Request(url, None, headers)
    try:
        resp = urllib.request.urlopen(req, None, 20)
        # decode the bytes so the str regex below can match
        page = resp.read().decode()
    except urllib.error.HTTPError as e:
        return (None, str(e.code))
    except Exception as e:
        return (None, 'Failed:' + str(e))
    version = None
    m = re.search('itemprop="softwareVersion">[ ]*([^<]+)[ ]*</div>', page)
    if m:
        html_parser = HTMLParser()
        version = html_parser.unescape(m.group(1))
    if version == 'Varies with device':
        return (None, 'Device-variable version, cannot use this method')
    if not version:
        return (None, "Couldn't find version")
    return (version.strip(), None)

def twitch_lookup(location):
    locsplit = location.split("/")
    if len(locsplit) > 1 and len(locsplit) == 3:
        channel = locsplit[0]
        type = locsplit[1]  # should be b or c
        id = locsplit[2]
    else:
        channel = locsplit[0]
        type = None
        id = None
    h = HTMLParser()
    fmt = "{}: {} playing {} ({})"  # Title: nickname playing Game (x views)
    if type and id:
        if type == "b":  # I haven't found an API to retrieve broadcast info
            soup = http.get_soup("http://twitch.tv/" + location)
            title = soup.find('span', {'class': 'real_title js-title'}).text
            playing = soup.find('a', {'class': 'game js-game'}).text
            views = soup.find('span', {'id': 'views-count'}).text + " view"
            views = views + "s" if not views[0:2] == "1 " else views
            return h.unescape(fmt.format(title, channel, playing, views))
        elif type == "c":
            data = http.get_json("https://api.twitch.tv/kraken/videos/" + type + id)
            title = data['title']
            playing = data['game']
            views = str(data['views']) + " view"
            views = views + "s" if not views[0:2] == "1 " else views
            return h.unescape(fmt.format(title, channel, playing, views))
    else:
        data = http.get_json("http://api.justin.tv/api/stream/list.json?channel=" + channel)
        if data and len(data) >= 1:
            data = data[0]
            title = data['title']
            playing = data['meta_game']
            viewers = "\x033\x02Online now!\x02\x0f " + str(data["channel_count"]) + " viewer"
            print(viewers)
            viewers = viewers + "s" if not " 1 view" in viewers else viewers
            print(viewers)
            return h.unescape(fmt.format(title, channel, playing, viewers))
        else:
            try:
                data = http.get_json("https://api.twitch.tv/kraken/channels/" + channel)
            except:
                return
            title = data['status']
            playing = data['game']
            viewers = "\x034\x02Offline\x02\x0f"
            return h.unescape(fmt.format(title, channel, playing, viewers))

def downloaded_to_intermediate(self, basefile, attachment=None):
    # Check to see if this might not be a proper SFS at all
    # (from time to time, other agencies publish their stuff
    # in SFS - this seems to be handled by giving those
    # documents a SFS nummer on the form "N1992:31"). Filter
    # these out.
    if basefile.startswith('N'):
        raise IckeSFS("%s is not a regular SFS" % basefile)
    filename = self.store.downloaded_path(basefile)
    try:
        t = TextReader(filename, encoding=self.source_encoding)
    except IOError:
        self.log.warning("%s: Fulltext is missing" % basefile)
        # FIXME: This code needs to be rewritten
        baseuri = self.canonical_uri(basefile)
        if baseuri in registry:
            title = registry[baseuri].value(URIRef(baseuri), self.ns['dcterms'].title)
            desc.value(self.ns['dcterms'].title, title)
        desc.rel(self.ns['dcterms'].publisher, self.lookup_resource("Regeringskansliet"))
        desc.value(self.ns['dcterms'].identifier, "SFS " + basefile)
        doc.body = Forfattning([Stycke(['Lagtext saknas'], id='S1')])
    # Check to see if the Författning has been revoked (using
    # plain fast string searching, no fancy HTML parsing and
    # traversing)
    if not self.config.keepexpired:
        try:
            t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
            datestr = t.readto('</i></b>')
            if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                self.log.debug('%s: Expired' % basefile)
                raise UpphavdForfattning("%s is an expired SFS" % basefile,
                                         dummyfile=self.store.parsed_path(basefile))
            t.seek(0)
        except IOError:
            t.seek(0)
    t.cuepast('<pre>')
    # remove &auml; et al
    try:
        # this is the preferred way from py34 onwards. FIXME: Move
        # this to ferenda.compat
        import html
        txt = html.unescape(t.readto('</pre>'))
    except ImportError:
        # this is the old way.
        hp = HTMLParser()
        txt = hp.unescape(t.readto('</pre>'))
    if '\r\n' not in txt:
        txt = txt.replace('\n', '\r\n')
    re_tags = re.compile("</?\w{1,3}>")
    txt = re_tags.sub('', txt)
    # adding an ending CRLF aids in producing better diffs
    txt += "\r\n"
    util.writefile(self.store.intermediate_path(basefile), txt,
                   encoding=self.source_encoding)
    return codecs.open(self.store.intermediate_path(basefile),
                       encoding=self.source_encoding)

def index():
    response = requests.get('http://oraalu.fe.up.pt:8888/aulas/WEB_DATA.json_alus',
                            auth=HTTPBasicAuth('ei12060', 'Vitor'))
    h = HTMLParser()
    document = json.loads(h.unescape(response.text))
    alus.insert(document)
    return "OK"

def get_entries(self):
    entries = []
    try:
        feed = self.get_feed()
        entries = feed["entries"]
    except Exception:
        # clear the cache so we try again
        key = RSS_JOBS_KEY % (self.org.id, self.id)
        cache.delete(key)
    html_parser = HTMLParser()
    for entry in entries:
        summary = entry.get("summary", "")
        entry["summary"] = strip_tags(html_parser.unescape(html_parser.unescape(summary)))
    return entries

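# Note: the double unescape above is deliberate for feeds whose summaries are
# double-escaped (e.g. '&amp;amp;' -> '&amp;' -> '&'); a single pass would
# leave '&amp;' behind. gen_slogan further up uses the same pattern.
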
def fetchTitle(url):
    page = ur.urlopen(ur.Request(url, data=None, headers={'User-Agent': "Mozilla/5.0 Python-urllib/2.6 EqBot"}))
    enc = page.info().get("Content-Type").partition("=")[2]
    if enc == "":
        enc = "utf-8"
    try:
        # Instantiate the parser instead of the original unbound-method hack
        # HTMLParser.unescape(HTMLParser, ...), which passed the class as self.
        return " ".join(HTMLParser().unescape(
            re.search(b"<title>(.*?)</title>",
                      page.read(10000).translate(None, b"\n\r\t"),
                      flags=re.I | re.M).expand(b"\\1").decode("utf-8")).split())
    except (AttributeError, HTTPError, URLError):
        return None

def handle_link_posted(self, data, match):
    if settings.POST_URLS.get(data["target"]) is None:
        self.logger.debug('No POST_URLS in the settings for this channel. Cannot post.')
        return None
    if settings.POST_URLS_SALTS.get(data['target']) is None:
        self.logger.debug('No security token for this channel. Cannot post.')
        return None
    url = match.group(1)
    # text after the url is assumed to be a title
    suggested_title = match.group(3)
    submission_salt = settings.POST_URLS_SALTS[data['target']]
    payload = {
        "url": url,
        "person": data['nick'],
        "submission_salt": submission_salt,
    }
    if suggested_title:
        payload["title"] = suggested_title
    post_url = settings.POST_URLS[data["target"]]
    response = requests.post(post_url, data=json.dumps(payload),
                             headers={'content-type': 'application/json'})
    if response.status_code != 200:
        self.send('Link Scraper Error: {}'.format(response.text), data)
    self.logger.debug('Posted {url} to {post_url}. Response was {text}. Response code was {code}'.format(
        code=response.status_code, url=url, text=response.text, post_url=post_url))
    if settings.POST_URL_TITLES and settings.POST_URL_TITLES.get(data["target"]):
        head = requests.head(url)
        content_type = head.headers['content-type'].split(' ')[0].strip(';')
        if content_type == 'text/html':
            request = requests.get(url, headers={
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'
            })
            soup = BeautifulSoup(request.text)
            if soup.title is not None:
                parser = HTMLParser()
                title = soup.title.string
                title = parser.unescape(title)
                title = title.strip()  # kill newlines and whitespace...
                self.send('Link title: {}'.format(title), data)

def get_body(text):
    blog_page = html.fromstring(text)
    result = blog_page.cssselect('#blogContent')
    if result:
        blogcontent_div = result[0]
        h = HTMLParser()
        body_text = h.unescape(etree.tostring(blogcontent_div).decode('utf-8'))
        return body_text
    else:
        return None

def post(self, message):
    h = HTMLParser()
    message = h.unescape(message)
    url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', message)
    self.auth = tweepy.OAuthHandler(TWITTER['consumer_key'], TWITTER['consumer_secret'])
    self.auth.set_access_token(TWITTER['access_token'], TWITTER['access_token_secret'])
    self.api = tweepy.API(self.auth)
    logger.debug('Posting to Twitter')
    self.api.update_status(status=message)

def render_html_diff(text1, text2, diff_id="diff_id"):
    """Render the html diff between text1 and text2;
    text1 and text2 will be normalized."""
    parser = HTMLParser()
    normalized_space_t1 = prepare_text_spaces(text1)
    normalized_space_t2 = prepare_text_spaces(text2)
    result = htmldiff.render_html_diff(normalized_space_t1, normalized_space_t2)
    soup, result = format_spaces(result)
    soup, result = normalize_diff(soup, diff_id)
    soup = wrap_diff(soup, diff_id)
    result = tag_to_text(soup.body)
    result = parser.unescape(result)
    return soup, result

def __get_prjinfo_all__(clsWeb):
    # Fetch the project IDs, then delete them
    # (the "&para" in "&param" was mis-rendered as "¶" in the source URL)
    strUrl = "util/frametree/OpensingleXtreeAction.do?datatype=son&openid=attendance_project&rootname=项目工作&conds=projectname@like&param=508,260&keyname=projectid"
    clsWeb.__set_param__(strUrl)
    r = requests.get(clsWeb.Referer, cookies=clsWeb.cookie)
    # lxml processing
    html = etree.HTML(r.text)
    # Filter the data: the 14th script element is where the data starts
    rr = html.xpath("/descendant::script[position()=14]")
    nLen = len(rr)
    if (nLen > 1):
        return (False, pDict)  # pDict appears to be defined at module scope
    # First-pass parsing: split the tree structure apart
    strTemp = str(etree.tostring(rr[0]))
    strTemps = strTemp.split('new xyTree.NodeNormal')
    nLen = len(strTemps)
    if (nLen < 2):
        return (False, pDict)
    # Parser object for decoding the page's HTML entities
    htmlparser = HTMLParser()
    # Loop over everything, extracting and parsing
    for i in range(1, nLen):
        strTemp2 = str(strTemps[i])
        strTemps2 = strTemp2.split(' = ')
        # Extract the objects that carry values
        if (len(strTemps2) > 0):
            # for j in range(0, len(strTemps2)):
            #     print(strTemps2[j])
            strName = clsWeb.Get_EleStr(strTemps2[0], "(\\'", "\\');")
            strKey = clsWeb.Get_EleStr(strTemps2[1], "\\'", "\\';")
            # Decode the Chinese entities
            strName = htmlparser.unescape(strName)
            pDict[strName] = strKey
            # print(strName)
            # print(pDict[strName])
    # Save the project dictionary
    path = "./mySetting/Prj_Dict.csv"
    __write_prjinfo_all__(pDict, path)
    return (True, pDict)

def decryption(request):
    if request.method == 'POST':
        sender = request.POST['sender']
        try:
            driver = webdriver.Chrome()  # Optional argument; if not specified, will search PATH.
            driver.get('https://web.whatsapp.com/')
            time.sleep(10)  # Let the user actually see something!
            search_box = driver.find_element_by_css_selector(
                "input[title*='Search or start new chat']").click()
            # time.sleep(5)
            search_box = driver.find_element_by_css_selector(
                "input[title*='Search or start new chat']").send_keys(sender)
            # time.sleep(5)
            search_box = driver.find_element_by_css_selector(
                "span[title='%s']" % sender).click()
            s = ""
            element = driver.find_elements_by_css_selector(
                "span[class*='_F7Vk selectable-text invisible-space copyable-text']")
            s = element[-1].get_attribute('innerHTML')
            data = s.split(">")
            s = data[1]
            da = s.split("<")
            mess = da[0]
            h = HTMLParser()
            mes = h.unescape(mess)
            print()
            print(mes + "SSS")
            print()
            mes = Decryptmsg(mes)
            print("HH" + mes + "HH")
            return render(request, 'decrypted.html', {'mes': mes, 'sender': sender})
        except:
            print('NOT SEND')
            messages.success(request, 'Try Again.. \n Check internet connection or write the username as saved in contacts list')
            return render(request, 'decrypt.html', {})
        return render(request, 'home.html', {})
    else:
        return render(request, 'decrypt.html', {})

def __processResponse(self):
    self.lastresponse.raise_for_status()  # -> make sure it is 200
    # Instantiate HTMLParser rather than the original unbound-method call
    # HTMLParser.unescape(HTMLParser, ...), which passed the class as self.
    self.lasthtml = HTMLParser().unescape(self.lastresponse.text)
    parser = DDHtmlParser()
    parser.feed(self.lasthtml)
    self.lastdata = parser.return_data()
    try:
        contenttype = self.lastdata['meta']['http-equiv']['content-type']  # "text/html; charset=iso-8859-1"
        self.lastencoding = rematch('^.*charset=([^;]*).*', contenttype).group(1)
        log.info('content-type: ' + contenttype)
        log.info('encoding: ' + self.lastencoding)
    except Exception:
        pass

def all_options(self):
    """Returns a list of tuples of all the options in the Select.
    Text first, value follows.

    Returns:
        A :py:class:`list` of :py:class:`Option`
    """
    # More reliable using javascript
    options = self.browser.execute_script(self.ALL_OPTIONS, self.browser.element(self))
    parser = HTMLParser()
    return [
        self.Option(normalize_space(parser.unescape(option[0])), option[1])
        for option in options]

def DecodeQuery1(fileName):
    data = [x.strip() for x in open(fileName, "r").readlines()]
    query_list = []
    for item in data:
        item = item.lower()
        if len(item) > 50 or len(item) < 5:
            continue
        h = HTMLParser()
        item = h.unescape(item)  # convert escapes such as &gt; or &nbsp; back to plain characters
        item = parse.unquote(item)  # URL-decode the percent-escapes (e.g. GBK-encoded %xx sequences)
        item, number = re.subn(r'\d+', "8", item)  # regex replace: collapse digit runs to "8"
        item, number = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?:]+', "http://u", item)
        query_list.append(item)
    return query_list

def convert_unicode(text):
    """
    Converts escaped HTML entities to real Unicode characters.
    """
    if isPython2:
        h = HTMLParser()
        s = h.unescape(text)
    else:
        try:
            s = unescape(text)
        except Exception:
            # Python 3.3 and below
            # https://stackoverflow.com/a/2360639/2295672
            s = HTMLParser().unescape(text)
    return s

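# Most helpers in this file instantiate HTMLParser just to call .unescape().
# That method was deprecated in Python 3.4 and removed in Python 3.9, so on
# modern interpreters html.unescape() is the drop-in replacement. A minimal
# version-agnostic sketch (the wrapper name is illustrative, not from the
# original sources):
try:
    from html import unescape as _unescape  # Python 3.4+
except ImportError:
    from html.parser import HTMLParser
    _unescape = HTMLParser().unescape  # legacy fallback

def unescape_html(text):
    """Decode entities such as '&amp;' and '&#241;' into plain characters."""
    return _unescape(text)

# >>> unescape_html('Spicy &quot;Jalape&#241;o&quot;')
# 'Spicy "Jalapeño"'
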
def find_user_id(url):
    """
    Find the user id from flickr by loading the user url
    and parsing the id out of it (kind of hacky).
    """
    html = urllib.request.urlopen(url).read().decode('utf-8')
    m = re.search(r"href=\"/services/feeds/photos_public.gne\?([^\"]+)", html)
    if m:
        h = HTMLParser()
        uid = h.unescape(m.group(1))
        uid = uid[3:uid.index("&")]
        return uid
    else:
        return None

def post(self, message):
    h = HTMLParser()
    message = h.unescape(message)
    url = re.findall(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        message)
    self.auth = tweepy.OAuthHandler(TWITTER['consumer_key'], TWITTER['consumer_secret'])
    self.auth.set_access_token(TWITTER['access_token'], TWITTER['access_token_secret'])
    self.api = tweepy.API(self.auth)
    logger.debug('Posting to Twitter')
    self.api.update_status(status=message)

def normalize(text):
    if text is None:
        return None
    if text[:5] == 'orcid':
        text = text.split('>')[1]
    h = HTMLParser()
    text = str(h.unescape(text))
    normalizr = Normalizr(language='en')
    normalizations = [
        ('replace_punctuation', {'replacement': ' '}),
        'lower_case',
        ('remove_stop_words', {'ignore_case': 'False'}),
        'remove_extra_whitespaces'
    ]
    text = normalizr.normalize(text, normalizations)
    return text

def plan_page(self, response):
    print(response.encoding)
    soup = BeautifulSoup(response.text, 'lxml', from_encoding="GB18030")
    last_page = int(soup.find(id=re.compile(r'LabelPageCount')).get_text())
    if last_page != response.save['page']:
        parmas = {'__EVENTTARGET': 'GridView1$ctl23$LinkButtonNextPage',
                  '__EVENTARGUMENT': ''}
        parmas['__VIEWSTATE'] = soup.find('input', {'name': '__VIEWSTATE'})['value']
        parmas['__EVENTVALIDATION'] = soup.find('input', {'name': '__EVENTVALIDATION'})['value']
        parmas['GridView1$ctl23$tbPage'] = str(response.save['page'])
        data = urlencode(parmas)
        temp = response.orig_url.split('&')
        url = temp[0]
        for i in temp[1:-1]:
            url += '&' + i
        print(url)
        url = url + '&' + str(response.save['page'] + 1)
        save_dict = response.save
        save_dict['page'] = save_dict['page'] + 1
        self.crawl(url, method='POST', data=data, callback=self.plan_page, age=1, save=save_dict)
    content = soup('a', {'target': '_blank'})
    print(soup.original_encoding)
    domains = {}
    domains[self.table_name[0]] = 'http://121.10.6.230/dggsweb/PHGSFJ/PHjsxmxzFJ.aspx?'
    domains[self.table_name[1]] = 'http://121.10.6.230/dggsweb/PHGSFJ/PHjsydghFJ.aspx?'
    domains[self.table_name[2]] = 'http://121.10.6.230/dggsweb/PHGSFJ/PHjsgcghFJ.aspx?'
    domains[self.table_name[9]] = 'http://121.10.6.230/dggsweb/PQGSFJ/PQszFJ.aspx?'
    domains[self.table_name[10]] = 'http://121.10.6.230/dggsweb/PQGSFJ/PQyslzFJ.aspx?'
    domains[self.table_name[11]] = 'http://121.10.6.230/dggsweb/PQGSFJ/PQyslzFJ.aspx?'
    for i in content:
        link = i['href']
        h = HTMLParser()
        link = h.unescape(link)
        params = link.split('?')[1]
        params = params.split('&')
        print(params)
        link = ''
        link += quote('项目受理编号'.encode('gbk')) + '=' + quote(params[0].split('=')[1].encode('GB18030')) + '&' + quote('公示'.encode('gbk')) + '=' + quote(params[1].split('=')[1].encode('GB18030'))
        domain = domains[response.save['type']]
        link = domain + link
        # print(link)
        self.crawl(link, callback=self.content_page, save=response.save)

def search_gpu(name):
    index = name.find("GB")
    if index == -1:
        pass
    else:
        name = name[0:index - 2]
    name = name.lower()
    name = name.replace("asus", "")
    name = name.replace("nvidia", "")
    name = name.replace("msi", "")
    name = name.replace("gigabyte", "")
    name = name.replace("pascal", "")
    name = name.replace("evga", "")
    name = name.replace("sapphire", "")
    name = name.replace("zotac", "")
    name = name.replace("amd", "")
    name = name.replace("frontier edition", "")
    name = name.replace("gainward", "")
    name = name.replace("xfx", "")
    name = name.replace("powercolor", "")
    name = name.replace("intel", "")
    name = name.replace("pro hd", "")
    name = name.replace("graphics", "")
    name = name.replace("ati", "")
    name = eraseFromString(name, "(", ")")
    name = name.replace("  ", " ")  # collapse double spaces left by the removals
    # renamed from `str`, which shadowed the builtin
    url = "https://www.passmark.com/search/zoomsearch.php?zoom_query=" + name + " price performance"
    try:
        r = requests.get(url).text
    except:
        return "RE"
    r = remove_html_markup(r)
    h = HTMLParser()
    r = h.unescape(r)
    index2 = r.find("- Price")
    if index2 == -1:
        index2 = len(r)
    index = r.rfind("PassMark -", 0, index2)
    if index == -1:
        return "NF"
    ret = r[index + 11:index2 - 1]
    if len(ret) > 50:
        return "NF"
    return ret

def get_lyrics(self, song):
    log('%s: searching lyrics for %s - %s' % (__title__, song.artist, song.title), debug=self.DEBUG)
    lyrics = Lyrics(settings=self.settings)
    lyrics.song = song
    lyrics.source = __title__
    lyrics.lrc = __lrc__
    try:
        req = requests.get(self.url % (urllib.parse.quote(song.artist), urllib.parse.quote(song.title)), timeout=10)
        response = req.text
    except:
        return None
    data = json.loads(response)
    try:
        self.page = data['url']
    except:
        return None
    if not self.page.endswith('action=edit'):
        log('%s: search url: %s' % (__title__, self.page), debug=self.DEBUG)
        try:
            req = requests.get(self.page, timeout=10)
            response = req.text
        except requests.exceptions.HTTPError as error:
            # strange... sometimes lyrics are returned with a 404 error
            if error.response.status_code == 404:
                response = error.response.text
            else:
                return None
        except:
            return None
        matchcode = re.search("class='lyricbox'>(.*?)<div", response)
        try:
            lyricscode = matchcode.group(1)
            htmlparser = HTMLParser()
            lyricstext = htmlparser.unescape(lyricscode).replace('<br />', '\n')
            lyr = re.sub('<[^<]+?>', '', lyricstext)
            if LIC_TXT in lyr:
                return None
            lyrics.lyrics = lyr
            return lyrics
        except:
            return None
    else:
        return None

def svm_dev_clean_data(json_data):
    clean_json_data = json_data
    html_parser = HTMLParser()
    remove_digits = str.maketrans('', '', digits)
    count = 0
    for obj in clean_json_data:
        # print('Before :' + obj['text'])
        obj['text'] = obj['text'].lower()
        obj['text'] = html_parser.unescape(obj['text'])
        obj['text'] = obj['text'].translate(remove_digits)
        obj['text'] = ' '.join(obj['text'].split())
        obj['text'] = strip_links(obj['text'])
        # obj['text'] = strip_all_entities(obj['text'])
        # print('After :' + obj['text'])
        count += 1
    return clean_json_data

def format_transcript_text(self, text):
    """
    Prepare unicode transcripts to be converted to WebVTT format.
    """
    new_text = [
        self.format_transcript_text_line(line)
        for line in text[0].splitlines()
    ]
    new_text = '\n'.join(new_text)
    html_parser = HTMLParser()
    unescaped_text = html_parser.unescape(new_text)
    if u"WEBVTT" not in text:
        text = "WEBVTT\n\n" + unescaped_text
    else:
        text = unescaped_text
    return text

def convert_data(file_path):
    full_data = []
    html_parser = HTMLParser()
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file.readlines():
            temp_data = json.loads(line)
            # €: alt + 0128
            data = temp_data["id"] + "€" + temp_data["title"] + "€" + \
                temp_data["content"].replace("\n", "") + "€" + \
                str(temp_data["tags"]).strip("[").strip("]").replace("'", "").replace(" ", "")
            # Convert HTML entities left over from the crawl (e.g. &amp;,
            # &lt;, &gt;) back into normal characters.
            data = html_parser.unescape(data).replace("\n", "")
            full_data.append(data)
    print("Successfully converted data!")
    print("Full Data Size: ", len(full_data))
    return full_data

def ogdch_localize_activity_item(msg):
    """Localize activity messages: this function takes an html message,
    finds the language dict in it, and replaces it with the localized value.
    """
    parser = HTMLParser()
    unescaped_msg = parser.unescape(msg)
    language_dict_result = re.search(REGEX_LANGUAGE_DICT, unescaped_msg)
    if not language_dict_result:
        return tk.literal(msg)
    language_dict = language_dict_result.group(0)
    localized_language_dict = get_localized_value_for_display(language_dict)
    localized_msg = unescaped_msg.replace(language_dict, localized_language_dict)
    return tk.literal(localized_msg)

def downloadFile(self, row, code_directory):
    try:
        # Extract different parameters from the row
        contestId = str(row["problem"]["contestId"])
        problemIndex = str(row["problem"]["index"])
        submissionId = str(row["id"])
        problemName = str(row["problem"]["name"]).replace(" ", "-")
        submissionLanguage = row["programmingLanguage"]
        contestName = self.getContestName(contestId)
        # Open the submission page for the submission id
        url = "http://codeforces.com/contest/" + str(contestId) + "/submission/" + str(submissionId)
        resp = urlopen(url)
        html = BeautifulSoup(resp, "html.parser")
        logging.info("Problem: " + str(contestName) + "-" + str(problemName))
        # Extract the source code
        sourceCodeDiv = html.body.find('pre', attrs={'id': 'program-source-text'})
        # Convert html entities to human-readable format,
        # e.g. "&lt;" becomes "<", "&gt;" becomes ">"
        h = HTMLParser()
        code = h.unescape(str(sourceCodeDiv.text))
        # Create a folder named after the contest
        save_directory = code_directory + os.path.sep + str(contestName)
        if not os.path.exists(save_directory):
            logging.info("Creating Directory: " + save_directory)
            os.mkdir(save_directory)
        # Save the source code in a file
        filename = save_directory + os.path.sep + problemIndex + "-" + problemName + "." + self.getFileExtension(submissionLanguage)
        logging.info("Saving source code in file: " + str(os.path.basename(filename)))
        with open(filename, "w+") as f:
            f.write(str(code))
    except Exception as e:
        logging.info("Error while downloading file !!!")
        raise
    return

async def grabSteamNews():
    r = requests.get('http://api.steampowered.com/ISteamNews/GetNewsForApp/v0002/?appid=730&count=1&maxlength=300&format=json')
    decoded = json.loads(r.text)
    htmlparser = HTMLParser()
    data = decoded["appnews"]["newsitems"][0]
    values = [
        data["url"],
        data["feedlabel"],
        data["title"],
        htmlparser.unescape(data["contents"]),
        data["gid"]
    ]
    return values

# print(grabSteamNews())

def get_json_row(cacheid):
    """ return a row as json """
    cacheid = cacheid.upper()
    conn = mysqlite.check_db()
    g_c = get_row(conn, cacheid)
    close_db(conn)
    if g_c is None:
        return "{}"
    html = HTMLParser()
    g_c.cachename = html.unescape(g_c.cachename)
    g_c.dltime = int(time.time()) - int(g_c.dltime)
    g_c.body = htmlcode.remove_all_attrs_except(g_c.body)
    return str(g_c)

def html_xml():
    s = 'Elements are written as "<tag>text</tag>".'
    print(s)
    print(html.escape(s))
    # Disable escaping of quotes
    print(html.escape(s, quote=False))
    s = 'Spicy Jalapeño'
    print(s.encode('ascii', errors='xmlcharrefreplace'))
    s = 'Spicy &quot;Jalape&#241;o&quot;.'
    p = HTMLParser()
    print(p.unescape(s))
    t = 'The prompt is &gt;&gt;&gt;'
    print(unescape(t))

def main():
    s = 'Elements are written as "<tag>text</tag>".'
    print(s)
    print(html.escape(s))
    print("Disable quotes escape")
    print(html.escape(s, quote=False))
    print("Displaying non-ascii text as ascii")
    s = 'Spicy Jalapeño'
    print(s.encode('ascii', errors='xmlcharrefreplace'))
    s = 'Spicy &quot;Jalape&#241;o&quot;.'
    p = HTMLParser()
    print(p.unescape(s))
    t = 'The prompt is &gt;&gt;&gt;'
    print(unescape(t))

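# The two demo functions above are the classic Python Cookbook recipe for
# HTML/XML escaping; the entity strings ('&quot;Jalape&#241;o&quot;' and
# '&gt;&gt;&gt;') had been flattened to plain text in the source and are
# restored so the unescape calls have work to do. Expected output:
#   Elements are written as "<tag>text</tag>".
#   Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.
#   Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".
#   b'Spicy Jalape&#241;o'
#   Spicy "Jalapeño".
#   The prompt is >>>
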
def save_videojs(self, data, suffix=''):
    """
    The saving handler.
    """
    i18n_ = self.runtime.service(self, "i18n").ugettext
    self.display_name = data['display_name']
    self.url = data['url'].strip()
    for language in self.languages.keys():
        subtitle_text = data['subtitle_text_' + language].strip()
        if subtitle_text:
            reader = detect_format(subtitle_text)
            if reader:
                try:
                    subtitle = WebVTTWriter().write(reader().read(subtitle_text))
                except:
                    return Response(json.dumps({
                        'error': i18n_("Error occurred while saving VTT subtitles for language %s") % language.upper()
                    }), status=400, content_type='application/json', charset='utf8')
                h = HTMLParser()
                self.subtitles[language] = h.unescape(subtitle)
                self.create_subtitles_file(self.subtitles[language])
            else:
                return Response(json.dumps({
                    'error': i18n_("Error occurred while saving VTT subtitles for language %s") % language.upper()
                }), status=400, content_type='application/json', charset='utf8')
        else:
            self.subtitles[language] = ""
    return {'result': 'success'}

def parse_p4p(response):
    resp_json = response.json()
    total_page = resp_json['totalPage']
    current_page = resp_json['currentPage']
    next_page = current_page + 1 if current_page < total_page else None
    p4ps = list()
    data = resp_json['data']
    parser = HTMLParser()
    for item in data:
        p4ps.append(P4P(
            keyword=parser.unescape(item['keyword']),
            qs_star=item['qsStar'],
            is_start=(item['state'] == "1"),
            tag=item['tag'],
        ))
    return next_page, p4ps

def DecodeQuery2(fileName1, fileName2):
    data1 = [x.strip() for x in open(fileName1, "r").readlines()]
    data2 = [x.strip() for x in open(fileName2, "r").readlines()]
    query_list = []
    time_list = []
    for item1, item2 in zip(data1, data2):
        item1 = item1.lower()
        if len(item1) > 50 or len(item1) < 5:
            continue
        h = HTMLParser()
        item1 = h.unescape(item1)  # convert escapes such as &gt; or &nbsp; back to plain characters
        item1 = parse.unquote(item1)  # URL-decode the percent-escapes (e.g. GBK-encoded %xx sequences)
        item1, number = re.subn(r'\d+', "8", item1)  # regex replace: collapse digit runs to "8"
        item1, number = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?:]+', "http://u", item1)
        query_list.append(item1)
        time_list.append(item2)
    return query_list, time_list

def decode_hash(hashed):
    """Use this function, Luke."""
    email = ''
    prefix = '0x' + hashed[0:2]
    slicer = 2
    while True:
        try:
            # XOR each byte with the key byte, then rebuild it as a decimal
            # character reference such as '&#104'
            email += '&#' + ('0' + hex(int('0x' + str(int(hashed[slicer:slicer + 2], 16) ^ int(prefix, 16)), 16)))[3:]
            slicer += 2
        except ValueError:
            # Ran past the end of the string: decode the entities we built.
            # (The original returned the undefined name `result`; the
            # accumulator here is `email`.)
            parser = HTMLParser()
            return parser.unescape(email)

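# decode_hash implements the classic Cloudflare-style email de-obfuscation:
# byte 0 is an XOR key and every following byte is a character XORed with it.
# A minimal sketch of the same decoding without the HTML-entity round trip
# (the helper name and sample string are illustrative, not from the source):
def decode_hash_direct(hashed):
    key = int(hashed[:2], 16)
    return ''.join(
        chr(int(hashed[i:i + 2], 16) ^ key)
        for i in range(2, len(hashed), 2)
    )

# Round-trip check: encode "a@b.c" with key 0x2a, then decode it back.
encoded = '2a' + ''.join('%02x' % (ord(c) ^ 0x2a) for c in 'a@b.c')
assert decode_hash_direct(encoded) == 'a@b.c'
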
def remove_twitter_syntax(text):
    """
    Removes special words or syntax from Twitter, including RT, users, urls
    :param text: tweet to be cleaned
    :return: str with the clean tweet
    """
    html_parser = HTMLParser()
    text = html_parser.unescape(text)
    text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "", text, flags=FLAGS)  # remove urls
    text = re.sub(r"@\w+", "", text, flags=FLAGS)  # remove users
    text = re.sub(r"<user>", "", text, flags=FLAGS)
    text = re.sub(r"<url>", "", text, flags=FLAGS)
    text = re.sub('RT[\s]+', "", text, flags=FLAGS)  # removes RT
    text = re.sub('rt[\s]+', "", text, flags=FLAGS)  # removes rt
    text = re.sub(' +', ' ', text, flags=FLAGS)
    return text.strip()

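# A quick usage sketch for remove_twitter_syntax (FLAGS is defined elsewhere
# in the original module; re.MULTILINE would be a typical value):
# >>> remove_twitter_syntax('RT @user check https://t.co/x &amp; more')
# 'check & more'
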
def transform_old_subtitle_to_new_form_if_exist(self, modify=False):
    """
    It is only possible to modify self.subtitles in the studio_view and
    save_videojs functions; studio_view only modifies the self.subtitles
    dict dynamically, while save_videojs modifies it permanently.
    """
    i18n_ = self.runtime.service(self, "i18n").ugettext
    subtitles = copy.deepcopy(self.subtitles)
    if (not 'pl' in self.subtitles) and self.subtitle_url and self.subtitle_text:
        reader = detect_format(self.subtitle_text)
        if reader:
            try:
                subtitle = WebVTTWriter().write(reader().read(self.subtitle_text))
            except:
                return Response(json.dumps({
                    'error': i18n_("Error occurred while saving VTT subtitles for language PL")
                }), status=400, content_type='application/json', charset='utf8')
            h = HTMLParser()
            subtitles['pl'] = h.unescape(subtitle)
            if modify:
                self.subtitles = subtitles
                self.subtitle_url = self.subtitle_text = ""
            else:
                return subtitles
        else:
            return Response(json.dumps({
                'error': i18n_("Error occurred while saving VTT subtitles for language PL")
            }), status=400, content_type='application/json', charset='utf8')
    else:
        return subtitles

def query(self, qry):
    params = {
        "action": "feedcontributions",
        "user": qry.encode('raw_unicode_escape').decode("ascii", errors='replace')
    }
    if self.opts['days_limit'] != "0":
        dt = datetime.datetime.now() - datetime.timedelta(days=int(self.opts['days_limit']))
        params["year"] = dt.strftime("%Y")
        params["month"] = dt.strftime("%m")
    res = self.sf.fetchUrl(
        f"https://en.wikipedia.org/w/api.php?{urllib.parse.urlencode(params)}",
        timeout=self.opts['_fetchtimeout'],
        useragent="SpiderFoot")
    if res['code'] in ["404", "403", "500"]:
        self.debug(f"Unexpected response code {res['code']} from Wikipedia")
        return None
    if not res['content']:
        return None
    links = list()
    try:
        parser = HTMLParser()
        for line in res['content'].split("\n"):
            matches = re.findall("<link>(.*?)</link>", line, re.IGNORECASE)
            for m in matches:
                if "Special:Contributions" in m:
                    continue
                d = parser.unescape(m)
                links.append(d)
        return set(links)
    except Exception as e:
        self.error(f"Error processing response from Wikipedia: {e}")
        return None

def correct(self, text):
    # grab html
    html = self.get_page('http://www.google.com/search?hl=en&q=' + urllib.parse.quote(text) + "&meta=&gws_rd=ssl")
    html_parser = HTMLParser()
    # save html for debugging
    # open('page.html', 'w').write(html)
    # pull pieces out
    match = re.search(r'(?:Showing results for|Did you mean|Including results for).*?<a.*?>(.*?)</a>', html)
    if match is None:
        fix = text
    else:
        fix = match.group(1)
        fix = re.sub(r'<.*?>', '', fix)
        fix = html_parser.unescape(fix)
    # return result
    return fix

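# Usage sketch for correct(): it scrapes Google's "Did you mean" / "Showing
# results for" markup, so it is brittle and will break whenever that markup
# changes. Assuming an instance whose get_page() returns the raw result HTML
# (instance name and output are illustrative):
# >>> fixer.correct('teh quick brwon fox')
# 'the quick brown fox'
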