def say_with_google(word, autoremove=True, background=False, debug=False):
    """
    Say a word with Google.

    https://ubuntuincident.wordpress.com/2012/03/27/audio-pronunciation-of-words-from-google/

    The return value is a tuple: (found, mp3_file), where found is True
    if the word was retrieved successfully (False otherwise), and mp3_file
    is the path of the locally saved mp3 (or None if it was not saved).
    Set autoremove to False if you want to work with the mp3 after this
    function has returned. The function stores the mp3 files in /tmp.
    """
    found = False       # Was the mp3 successfully found?
    mp3_file = None     # Is the locally saved mp3 file kept?

    url = template.format(word=word)
    content = web.get_page(url, user_agent=True)
    if content:
        found = True
        fname = '/tmp/{word}.mp3'.format(word=word)
        fs.store_content_in_file(content, fname, overwrite=True)
        mp3_file = fname
        if not debug:
            play(fname, background=background)
        if autoremove:
            os.unlink(fname)
            mp3_file = None
    else:
        found = False
        mp3_file = None

    return (found, mp3_file)
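# A minimal usage sketch for say_with_google() (hedged: the word and the
# _demo_say name are hypothetical; it assumes the module-level `template`
# URL, web, fs, os and play() are set up as above; debug=True skips playback):
def _demo_say():
    found, mp3 = say_with_google('hello', autoremove=False, debug=True)
    if found:
        print '# mp3 kept at:', mp3    # e.g. /tmp/hello.mp3
        os.unlink(mp3)    # autoremove=False, so clean up manually
    else:
        print '# could not fetch the word'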
def get_my_external_ip():
    """
    Get my external IP.

    Local IP: http://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
    """
    return get_page('http://ifconfig.me/ip')
def extract_image_urls(url):
    origin = None
    li = []
    text = get_page(url, user_agent=True, referer=True)
    soup = bs.to_soup(text)
    # this version worked for a day:
    # for pic in soup.findCssSelect('div.pic'):
    #     a = pic.find('a', href=True)
    #     if a:
    #         li.append(a['href'])
    # here is a new version, updated after the site changed:
    for div in soup.findCssSelect("div.pic"):
        img = div.find("img")
        if img and img.has_key("src"):
            li.append(img["src"].replace("/small/", "/large/"))
    for div in soup.findCssSelect("html body form#aspnetForm div#main div"):
        result = re.search(r"URL: (http://.*)View full images", div.text)
        if result:
            origin = result.group(1)
    return origin, li
def visit(blog, dic):
    url = 'http://{name}.wordpress.com'.format(name=blog)
    text = get_page(url)
    soup = bs.to_soup(text)
    hits = soup.findCssSelect('div#blog-stats ul li')[0].text
    hits = int(hits.replace('hits', '').replace(',', '').strip())
    #
    dic[url] = hits
def setup_module(module):
    """runs just once per module"""
    global GOOGLE_HTML
    GOOGLE_HTML = web.get_page(GOOGLE)
    try:
        os.unlink(cfg.TEST_TMP_FILE)
    except OSError:
        pass    # maybe it didn't exist
def get_image_url_list(url):
    """Controller function for getting the URLs of the JPG images."""
    text = get_page(url)
    doc = lx.to_doc(text)
    subpages = get_subpages(doc)
    images = extract_images_from_pages(subpages)
    return images
def main(url):
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html, method=web.HTML2TEXT)
    #txt = ascii.unicode_to_ascii(txt)
    #txt = txt.replace(u'\xb7', '-')
    #txt = ascii.remove_non_ascii(txt).encode('ascii')
    print_result(txt)
def extract_images_from_pages(pages):
    """Extract images from subpages."""
    li = []
    for page in pages:
        doc = lx.to_doc(get_page(page))
        image = get_jpg_image(doc)
        li.append(image)

    return [x for x in li if x]    # remove None elems
def process(word):
    """Process the given word.

    The return value is a tuple: (word, hyphenation, pronunciation mp3).
    """
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    doc = lx.to_doc(html)
    return (word, get_hyphen(doc), get_mp3(doc))
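# Usage sketch for process() above (hypothetical word and _demo name;
# assumes _template, get_hyphen() and get_mp3() exist in this module):
def _demo_process():
    word, hyphen, mp3 = process('apple')
    print '# word:  ', word
    print '# hyphen:', hyphen    # hyphenation, e.g. ap-ple
    print '# mp3:   ', mp3       # pronunciation mp3 (path or URL)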
def extract():
    text = web.get_page(URL)
    text = text.split('g_img={url:')[1]
    text = text.split(',')[0].replace("'", "")
    img_url = urljoin(URL, text)
    fname = img_url.split('/')[-1]
    img_ext = os.path.splitext(fname)[1]
    save_name = get_date_from_year_to_day() + img_ext
    return (img_url, save_name)
def process(url):
    text = get_page(url, user_agent=True)
    doc = lx.to_doc(text)
    #lx.show_paths(doc, find='Montreal, Quebec')
    tag = doc.cssselect('h1#locationName.brTopLeft5')[0]
    city = tag.text
    print city
    tag = doc.cssselect('div#tempActual span.pwsrt span.nobr')[0]
    celsius = tag.text_content()
    print celsius
def demo8():
    url = "http://python.org/"
    text = get_page(url)
    # doc = lx.to_doc(text, parser=scraper.HTML5PARSER)
    # doc = lx.to_doc(text)
    doc = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    # print type(doc)
    # print etree.tostring(doc)
    title = doc.cssselect("html head title")[0]
    print title.text
def extract(test=False):
    text = web.get_page(URL)
    text = text.split('g_img={url:')[1]
    text = text.split(',')[0].replace("'", "")
    img_url = urljoin(URL, text)
    fname = img_url.split('/')[-1]
    fname = unquote(fname).split('/')[-1]
    if not test:
        print '# fname:', fname
    save_name = '{date}-{fname}'.format(date=get_date_from_year_to_day(), fname=fname)
    return (img_url, save_name)
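# Usage sketch for extract() above (the _demo name is hypothetical; it
# assumes web.get_page() and fs.store_content_in_file() from this library
# can be used to do the actual download):
def _demo_save_image():
    img_url, save_name = extract(test=True)
    content = web.get_page(img_url)
    fs.store_content_in_file(content, save_name)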
def process(word):
    url = _template.format(word=word)
    html = web.get_page(url, user_agent=True)
    txt = web.html_to_text(html).decode('utf-8')
    #txt = ascii.unicode_to_ascii(txt)
    txt = txt.replace(u'\xb7', '-')
    txt = ascii.remove_non_ascii(txt).encode('ascii')
    txt = re.sub(r'\[.*?\.gif\]', '', txt)    # drop inline [*.gif] markers
    print_result(txt)
def get_slogan(word, times=1):
    assert 1 <= times <= 10    # be nice with the server
    #
    li = []
    url = BASE + urllib.urlencode({'user': word})
    for _ in xrange(times):
        text = get_page(url, user_agent=True)
        soup = bs.to_soup(text)
        slogan = soup.findCssSelect('html body div p')[0].text
        if string.count(slogan, '.') == 1 and not slogan[0].isupper():
            slogan = slogan.replace('.', '')
        if len(slogan) >= 2 and slogan[-1] == '.' and slogan[-2] == '!':
            slogan = slogan[:-1]
        li.append(slogan)

    return li
def definitions(word):
    """
    Fetch the definition of the word.
    """
    template = 'http://api.wordnik.com//v4/word.json/{word}/definitions?includeRelated=false&includeTags=false&useCanonical=false'
    url = prepare_url(template, word)
    try:
        decoded = json.loads(get_page(url))
        #print json.dumps(decoded)
        #
        partOfSpeech = decoded[0]['partOfSpeech']
        text = decoded[0]['text']
        d = {}
        d['partOfSpeech'] = partOfSpeech
        d['text'] = text
        return d
    except Exception:
        return None
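# Usage sketch for definitions() (hypothetical word; assumes prepare_url()
# fills in the word and the API key as elsewhere in this module):
def _demo_definitions():
    d = definitions('python')
    if d:    # None means the lookup failed
        print '# part of speech:', d['partOfSpeech']
        print '# definition:    ', d['text']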
def download(self, warning=True):
    """Download yourself."""
    if os.path.exists(self.get_skip_path()):
        return False
    # else
    if not self.exists():
        if self.make_dirs():
            obj = web.get_page(self.file_url, user_agent=True, referer=True)
            fs.store_content_in_file(obj, self.get_local_path())
    ok = self.exists()
    if not ok and warning:
        print >>sys.stderr, "# warning: couldn't download {url}.".format(url=self.file_url)
    if self.readme:
        self.save_readme()
    return ok
def check():
    """Just an example of how to use the BS4 library."""
    text = str(get_page(URL))
    soup = bs.to_soup(text, PARSER)
    book = soup.find('div', {'class': 'module bookSmall'})
    link = book.find('a', href=True)
    print link['href']
    #
    book = soup.find('div', {'class': 'module fullBook'})
    try:
        title = book.find('span', {'property': 'dc:title'}).text.lower()
    except AttributeError:
        title = ""
    print title
    tabs = soup.find_all('div', {'class': 'tabModules'})[-1]
    try:
        desc = tabs.find('p', {'class': 'paragraph'}).text.lower()
    except AttributeError:
        desc = ""
    print desc
def examples(word, limit=None):
    """
    Fetch examples.
    """
    template = 'http://api.wordnik.com//v4/word.json/{word}/examples'
    url = prepare_url(template, word)
    #print url
    try:
        decoded = json.loads(get_page(url))
        #print json.dumps(decoded)
        li = []
        array = decoded['examples']    # no limit, everything
        if limit:    # if limit specified
            array = array[:limit]
        for e in array:
            li.append(e['text'])
        #
        return li
    except Exception:
        return None
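# Usage sketch for examples() (hypothetical word; limit=None returns all
# example sentences, and the `or []` guards against the None error case):
def _demo_examples():
    for sentence in examples('python', limit=3) or []:
        print '#', sentence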
def extract_list():
    """
    Extract the proxy list from the base URL.
    """
    sys.stdout.write('# extracting list')
    proxies = []
    text = get_page(BASE, user_agent=True)
    soup = bs.to_soup(text)
    proxylist = soup.findCssSelect('table.proxylist')[0]
    for tr in proxylist.findAll('tr', {'class': True}):
        if tr['class'] in ('odd', 'even'):
            cols = tr.findAll('td')
            ip = cols[0].text
            typ = cols[1].text    # renamed from `type` to avoid shadowing the built-in
            country = cols[2].text
            proxies.append(Proxy(ip, typ, country))
            sys.stdout.write('.')
    #
    print 'done.'
    return proxies
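# Usage sketch for extract_list() (assumption: Proxy exposes ip, type and
# country attributes matching the constructor call above):
def _demo_proxies():
    for proxy in extract_list():
        print '#', proxy.ip, proxy.type, proxy.country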
def is_internet_on(method=1):
    """Check if the Internet connection is on."""
    if method == 1:
        # At my current place we have a wifi that redirects to a login page,
        # so we always have a connection. That's why I check the content of
        # the fetched webpage.
        text = web.get_page(URL, timeout=3)
        if text:
            if '<title>Google</title>' in text:
                return True
        # else:
        return False
    elif method == 2:
        # http://stackoverflow.com/questions/3764291/checking-network-connection
        try:
            urllib2.urlopen('http://www.google.com', timeout=1)
            return True
        except urllib2.URLError:
            return False
    else:
        print '# warning: unknown method in is_internet_on()'
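# Usage sketch for is_internet_on() (the _demo name is hypothetical;
# method=1 needs the module-level URL pointing at Google's front page,
# method=2 only needs urllib2):
def _demo_connectivity():
    if is_internet_on(method=2):
        print '# we are online'
    else:
        print '# no connection'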
def main():
    text = web.get_page(URL)
    soup = BeautifulSoup(text)
    countries = Countries()
    for row in soup.findAll('tr'):
        cols = row.findAll('td')
        if cols:
            rank = cols[0].text
            if rank and re.search(r'^\d+$', rank):
                country = cols[1].find('a', title=True).text
                population = int(cols[2].text.replace(',', ''))
                #print country, ':', population
                countries.add(country, population)
    #countries.sort()
    d = {}
    for country in countries.countries:
        d[country.name] = country.population
    print json.dumps(d)
def get_info(self):
    text = get_page(self.url)
    return json.loads(text)
    assert base_url is not None
    #
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(base_url, tag['href'])

    return soup

# The patch is applied automatically when this module is imported.
css_patch()

#############################################################################

if __name__ == "__main__":
    url = "http://index.hu"
    text = web.get_page(url)
    soup = to_soup(text)
    print prettify(soup)

#
LINKS = """
<html>
<head>
<title>retrogames.com</title>
</head>
<a href="http://retrogames.com">Retro Games HQ</a>
<a href="/games/elite">Elite</a>
<a href="/games/commando">Commando</a>
</html>
"""
"""
Demo for lx.py.

Download population of countries.
"""

import re

from jabbapylib.web.scraper import lx
from jabbapylib.web.web import get_page


def process(doc):
    data = {}
    for row in doc.cssselect('tr'):
        cols = row.cssselect('td')
        if cols:
            rank = cols[0].text
            if rank and re.search(r'^\d+$', rank):
                country = cols[1].cssselect('a[title]')[0].text
                population = int(cols[2].text.replace(',', ''))
                data[country] = population
    print data

#############################################################################

if __name__ == "__main__":
    url = 'https://secure.wikimedia.org/wikipedia/en/wiki/List_of_countries_by_population'
    text = get_page(url)
    doc = lx.to_doc(text)
    process(doc)
def get_json(self, ip):
    text = get_page(gp_template.format(ip=ip))
    # strip the JSONP wrapper, i.e. geoPlugin(...) -> ...
    text = re.sub(r'^geoPlugin\(', '', text)
    text = re.sub(r'\)$', '', text)
    return json.loads(text)
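# Usage sketch for get_json() (hedged: the IP and the geoplugin_countryName
# key are illustrative assumptions about the geoPlugin response; the method
# is assumed to live on a class instance `geo`):
def _demo_geo(geo):
    d = geo.get_json('8.8.8.8')
    print d.get('geoplugin_countryName')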
def demo2():
    url = "http://projecteuler.net/"
    text = get_page(url)
    doc = lx.to_doc(text)
    lx.make_links_absolute(doc, base_url=url)
    print lx.tostring(doc)
def demo9():
    url = "http://python.org/"
    text = get_page(url)
    soup = bs.to_soup(text)
    title = soup.findCssSelect("html head title")[0]
    print title.text
def test_store_content_in_file(self):
    content = web.get_page(GOOGLE)
    assert not os.path.exists(cfg.TEST_TMP_FILE)
    fs.store_content_in_file(content, cfg.TEST_TMP_FILE)
    assert os.path.getsize(cfg.TEST_TMP_FILE) > 0
    os.unlink(cfg.TEST_TMP_FILE)