def get(self, *args):
    query = base.collapse(urllib.unquote(args[1]))
    if not query:
        return self.ok("Please provide two Wikipedia article titles.")
    query = StringIO.StringIO(query)
    try:
        query_tokens = csv.reader(query, delimiter=" ").next()
    except:
        return self.ok("Please use proper quoting for arguments.")
    try:
        from_name = query_tokens[0] or ""
    except:
        from_name = ""
    if not from_name:
        return self.ok("Please name a starting Wikipedia article title.")
    try:
        to_name = query_tokens[1] or ""
    except:
        to_name = ""
    if not to_name:
        return self.ok("Please name an ending Wikipedia article title.")
    query = urllib.urlencode({"from": from_name, "to": to_name})
    uri = API_URI + "?" + query
    try:
        html = api.urlfetch.fetch(uri).content
        html = unescape.unescape(html.decode("latin1"))
        tree = BeautifulSoup.BeautifulSoup(html)
    except Exception, error:
        return self.ok("Timeout fetching Wikipedia distance.")
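# Illustration (hypothetical input) of the csv-based token split above:
# csv.reader with delimiter=" " honors double-quoted fields, so
#   csv.reader(StringIO.StringIO('"Six Degrees" Bacon'), delimiter=" ").next()
# yields ['Six Degrees', 'Bacon'], letting a multi-word article title be
# passed as a single quoted argument.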
def get(self, *args): query = base.collapse(urllib.unquote(args[1])) query = urllib.urlencode({"key": query, "type": "Books", "page":"1"}) uri = API_URI + "?" + query try: html = api.urlfetch.fetch(uri).content html = unescape.unescape(html.decode("latin1")) tree = BeautifulSoup.BeautifulSoup(html) except Exception, error: return self.ok("Timeout fetching ISBN information.")
def get(self, *args): query = base.collapse(urllib.unquote(args[1])) query = urllib.urlencode({"key": query, "type": "Books", "page": "1"}) uri = API_URI + "?" + query try: html = api.urlfetch.fetch(uri).content html = unescape.unescape(html.decode("latin1")) tree = BeautifulSoup.BeautifulSoup(html) except Exception, error: return self.ok("Timeout fetching ISBN information.")
def get(self, *args):
    word = args[1]
    if not word:
        # return was missing here, so execution fell through to unquote("")
        return self.ok("Please provide a word.")
    word = urllib.unquote(word)
    payload = urllib.urlencode({"q": word})
    try:
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        html = api.urlfetch.fetch(API_URI, method=api.urlfetch.POST,
                                  payload=payload, headers=headers).content
    except Exception:
        return self.ok("Error fetching results.")
    tree = BeautifulSoup.BeautifulSoup(html)
    try:
        message = base.collapse(tree.find("blockquote").string)
        message = unescape.unescape(message)
    except:
        return self.ok("Error parsing results.")
    return self.ok(message)
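# Design note: the handler above submits the form field directly via POST
# (hence the explicit application/x-www-form-urlencoded header) and assumes
# the definition arrives in the page's first <blockquote>; the parse step is
# wrapped in its own try/except so a layout change degrades to an error
# message rather than a traceback.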
def _repopulate(self): print "Regenerating DOM" self.dom = xml.dom.minidom.parseString(urllib2.urlopen(self.url).read()) print "Repopulating items array" self.index = 0 # this monster achieves: # - slice the first description node off (it's a channel desc) # - crop each string to a max of 300 chars # - URI decode text # - unescape HTML entities self.items = map( lambda x: unescape( urllib2.unquote( x.firstChild.data))[0:300], \ self.dom.getElementsByTagName("description")[1:] ) # No unicode support in puredata (at least not via pyext. It claims it can't convert) print "Filtering unicode characters..." def maybe_delete(c): try: return str(c) except: return " " self.items = map(lambda x: ''.join(x), map(lambda y: map(maybe_delete, y), self.items)) print "Items now has %s entries" % len(self.items)
def get_all_text(s):
    t = s.findAll(text=True)
    t = unescape(' '.join(t))
    t = ' '.join(t.split())
    return t
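# Hypothetical usage, assuming `s` is a BeautifulSoup tag and `unescape`
# is the shared entity-decoding helper:
#
#   soup = BeautifulSoup.BeautifulSoup('<p>fish &amp;\n   chips</p>')
#   get_all_text(soup)   # -> 'fish & chips'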
# python3
import lxml.html

filename = 'petrol.html'
with open(filename, 'r') as f:
    text = f.read()
html = lxml.html.fromstring(text)
newtext = lxml.html.tostring(html)

newfile = 'petrol2.html'
with open(newfile, 'wb') as f:
    f.write(newtext)

# https://stackoverflow.com/questions/9487133/python-convert-html-ascii-encoded-text-to-utf8
import unescape
new2 = unescape.unescape(text)

newfile2 = 'petrol_unescape.html'
with open(newfile2, 'wt') as f:
    f.write(new2)
# http://effbot.org/zone/re-sub.htm#unescape-html
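# The `unescape` module imported above is not shown in these snippets;
# judging by the effbot link it is the classic entity-decoding recipe.
# A minimal sketch of an equivalent helper on Python 3 (the name
# unescape_sketch is illustrative, not the module's actual API):
import re
from html.entities import name2codepoint

def unescape_sketch(text):
    """Replace HTML character references (&gt;, &#62;, &#x3e;) with the
    characters they denote, leaving unknown references untouched."""
    def fixup(m):
        ref = m.group(0)
        if ref.startswith('&#'):          # numeric character reference
            try:
                if ref[2] in ('x', 'X'):
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            except ValueError:
                return ref
        name = ref[1:-1]                  # named entity, e.g. &amp;
        return chr(name2codepoint[name]) if name in name2codepoint else ref
    return re.sub(r'&#?\w+;', fixup, text)

# On Python 3.4+ the standard library covers the same ground:
#   html.unescape('&lt;b&gt;petrol&lt;/b&gt;')  # -> '<b>petrol</b>'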
def clean(text, hasDebugFlag=False):
    """
    Transforms wiki markup.
    @see https://www.mediawiki.org/wiki/Help:Formatting
    """
    # Drop transclusions (templates, parser functions)
    text = dropNested(text, r'{{', r'}}')

    # Drop tables
    text = dropNested(text, r'{\|', r'\|}')

    # Remove any found signatures and timestamps
    text = removeSignature(text)

    # Replace external links
    text = replaceExternalLinks(text)

    # Replace internal links
    text = replaceInternalLinks(text)

    # Drop MagicWords behavioral switches
    text = magicWordsRE.sub('', text)

    ################ Process HTML ###############

    # Turn into HTML, except for the content of <syntaxhighlight>
    res = ''
    cur = 0
    for m in syntaxhighlight.finditer(text):
        end = m.end()
        res += unescape(text[cur:m.start()]) + m.group(1)
        cur = end
    text = res + unescape(text[cur:])

    # Handle bold/italic/quote
    text = bold_italic.sub(r'\1', text)
    text = bold.sub(r'\1', text)
    text = italic_quote.sub(r'"\1"', text)
    text = italic.sub(r'"\1"', text)
    text = quote_quote.sub(r'"\1"', text)
    # Residuals of unbalanced quotes
    text = text.replace("'''", '').replace("''", '"')

    # Collect spans
    spans = []
    # Drop HTML comments
    for m in comment.finditer(text):
        spans.append((m.start(), m.end()))

    # Drop self-closing tags
    for pattern in selfClosing_tag_patterns:
        for m in pattern.finditer(text):
            spans.append((m.start(), m.end()))

    # Drop ignored tags
    for left, right in ignored_tag_patterns:
        for m in left.finditer(text):
            spans.append((m.start(), m.end()))
        for m in right.finditer(text):
            spans.append((m.start(), m.end()))

    # Bulk remove all spans
    text = dropSpans(spans, text)

    # Drop discarded elements
    for tag in discardElements:
        text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)

    # Turn into text what is left (&nbsp;) and <syntaxhighlight>
    text = unescape(text)

    # Expand placeholders
    for pattern, placeholder in placeholder_tag_patterns:
        index = 1
        for match in pattern.finditer(text):
            text = text.replace(match.group(), '%s_%d' % (placeholder, index))
            index += 1

    text = text.replace('<<', '«').replace('>>', '»')

    #############################################

    # Cleanup text
    text = text.replace('\t', ' ')
    text = re.sub(' (,:\.\)\]»)', r'\1', text)
    text = re.sub('(\[\(«) ', r'\1', text)
    text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuation

    # Remove lists, tables and such
    text = compact(text)

    # Remove symbols and reduce multiple successive spaces to one
    text = removeSymbols(text)
    text = spaces.sub(' ', text)

    if hasDebugFlag:
        print(text)
    return text
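# clean() above leans on helpers defined elsewhere in WikiExtractor
# (dropNested, dropSpans, compact, and the compiled regexes). As a sketch
# of the span bookkeeping the HTML pass depends on, a dropSpans-style
# helper might look like this (an assumption, not the project's exact code):
def drop_spans_sketch(spans, text):
    """Remove the (start, end) regions in `spans` from `text`,
    coalescing nested or overlapping regions."""
    res = ''
    offset = 0
    for start, end in sorted(spans):
        if start > offset:          # skip spans inside one already dropped
            res += text[offset:start]
        offset = max(offset, end)
    return res + text[offset:]

# Example: drop_spans_sketch([(4, 12)], 'keep<!--x-->keep') -> 'keepkeep'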