def shortcut_to_server(contents):
    """Upload *contents* to the faces server and return the shortcut id it assigns.

    After storing, the shortcut is fetched back via server_to_shortcut and the
    round trip is asserted to reproduce the same (stripped) lines.
    """
    query = urllib.urlencode([("contents", contents)])
    reply = tools.my_url_open("http://65.215.1.20/faces/add_shortcut.py?" + query)
    shortcut_id = reply.read().strip()
    # Round-trip check: what the server stored must match what we sent.
    assert (server_to_shortcut(shortcut_id).strip().splitlines()
            == contents.strip().splitlines())
    return shortcut_id
def shortcut_to_server(contents):
    """Store *contents* on the faces server; return the shortcut name it hands back."""
    base = "http://65.215.1.20/faces/add_shortcut.py?"
    handle = tools.my_url_open(base + urllib.urlencode([("contents", contents)]))
    name = handle.read().strip()
    # Verify the round trip: fetching the shortcut must reproduce the input lines.
    stored_lines = server_to_shortcut(name).strip().splitlines()
    assert stored_lines == contents.strip().splitlines()
    return name
def get_flower_urls():
    """Collect href="...htm" links from all 19 list pages of flower-dictionary.com.

    Returns a list of matched 'href="...htm"' substrings; duplicates across
    pages are NOT removed.
    """
    urlstring = "http://flower-dictionary.com/list.php?page="
    lst = []
    for i in range(19):
        html = tools.my_url_open(urlstring + str(i)).read()
        # Fixed regex: the original '.*?.htm' used an unescaped '.', which
        # matched any character before "htm" instead of the literal dot.
        lst += re.findall(r'href=".*?\.htm"', html)
    return lst
def get_flower_urls():
    """Return every href="...htm" link across the 19 flower-dictionary list pages."""
    base = "http://flower-dictionary.com/list.php?page="
    links = []
    for page in range(19):
        page_html = tools.my_url_open(base + str(page)).read()
        # '\.' escapes the dot: the original pattern's bare '.' matched any
        # character preceding "htm" rather than the extension separator.
        links.extend(re.findall(r'href=".*?\.htm"', page_html))
    return links
import tools, time
from PIL import Image
import re
from urllib import urlretrieve

# Scrape the index page listing every animated sign-language GIF.
urlstring = "http://www.lifeprint.com/asl101/gifs-animated/"
html = tools.my_url_open(urlstring).read()
lst = re.findall('href.*</a>', html)
# lst2: the href target (chars after 'href="' up to the closing quote).
lst2 = [anchor[6:anchor.rindex('"')] for anchor in lst]
# lst3: the link text after the closing quote, minus the trailing "</a>".
lst3 = [anchor[anchor.rindex('"') + 2:-4] for anchor in lst]
# One-shot download pass, disabled after the first run:
#for i in range(len(lst2)):
#    urlretrieve("http://www.lifeprint.com/asl101/gifs-animated/"+lst2[i],"c:/data/signs/"+lst3[i]+".gif")
def server_to_shortcut(label="test"):
    """Fetch the shortcut stored under *label* from the faces server.

    Returns the server's response body with surrounding whitespace stripped.
    """
    query = urllib.urlencode([("name", label)])
    reply = tools.my_url_open("http://65.215.1.20/faces/add_shortcut.py?" + query)
    return reply.read().strip()
urlstring = "http://www.amazon.com/gp/search/ref=sr_hi_4?rh=n%3A228013%2Cn%3A%21468240%2Cn%3A551240%2Cn%3A13397641%2Cn%3A324030011&bbn=324030011&ie=UTF8&qid=1286030645#/ref=sr_pg_2?rh=n%3A228013%2Cn%3A%21468240%2Cn%3A551240%2Cn%3A13397641%2Cn%3A324030011%2Cp_6%3AA2PHD1PQGLN1MQ&page=pG&bbn=324030011&sort=pmrank&ie=UTF8&qid=1286031509" s="" for i in range(163): s+=str(i)+"_" print s exit(0) imgs2 = [] for p in range(2,4): url = urlstring.replace("pG",str(p)) print url html = tools.my_url_open(url).read() #print html imgs = html.split("AA160_.jpg") for im in imgs[:-1]: if im.rfind("http:")!=-1: imgs2.append(im[im.rfind("http:"):]+"AA160_.jpg") st = "<html>" for i in imgs2: st+="<img src='"+i+"'>\n " tools.my_write('c:/temp/tiles/massive2.html',st) print "Page ",p,"got ",len(imgs2),"images" time.sleep(2)
import tools, time from PIL import Image urlstring = "http://www.amazon.com/s/qid=1285958507/ref=sr_pg_pG?ie=UTF8&sort=-price&keywords=ties&bbn=1036592&rh=k%3Aties%2Cn%3A1036592%2Cn%3A%211036682%2Cp_6%3AA32FQKBE4XLKI7&page=pG" a=tools.SimpleDisplay() imgs2 = [] for p in range(66): html = tools.my_url_open(urlstring.replace("pG",str(p))).read() imgs = html.split("190,246_.jpg") for im in imgs[:-1]: if im.rfind("http:")!=-1: imgs2.append(im[im.rfind("http:"):]+"190,246_.jpg") st = "<html>" for i in imgs2: st+="<img src='"+i+"'>\n " tools.my_write('c:/temp/ties/massive2.html',st) print "Page ",p,"got ",len(imgs2),"images" time.sleep(2)
from PIL import Image urlstring = "http://www.amazon.com/gp/search/ref=sr_hi_4?rh=n%3A228013%2Cn%3A%21468240%2Cn%3A551240%2Cn%3A13397641%2Cn%3A324030011&bbn=324030011&ie=UTF8&qid=1286030645#/ref=sr_pg_2?rh=n%3A228013%2Cn%3A%21468240%2Cn%3A551240%2Cn%3A13397641%2Cn%3A324030011%2Cp_6%3AA2PHD1PQGLN1MQ&page=pG&bbn=324030011&sort=pmrank&ie=UTF8&qid=1286031509" s = "" for i in range(163): s += str(i) + "_" print s exit(0) imgs2 = [] for p in range(2, 4): url = urlstring.replace("pG", str(p)) print url html = tools.my_url_open(url).read() #print html imgs = html.split("AA160_.jpg") for im in imgs[:-1]: if im.rfind("http:") != -1: imgs2.append(im[im.rfind("http:"):] + "AA160_.jpg") st = "<html>" for i in imgs2: st += "<img src='" + i + "'>\n " tools.my_write('c:/temp/tiles/massive2.html', st) print "Page ", p, "got ", len(imgs2), "images" time.sleep(2)
def get_gogh_urls():
    """Return every 'catalog/Painting/...html' link on the Van Gogh catalog index."""
    index_html = tools.my_url_open(
        "http://www.vangoghgallery.com/catalog/Painting/").read()
    return re.findall('catalog/Painting/.*?html', index_html)
def get_gogh_urls(): urlstring = "http://www.vangoghgallery.com/catalog/Painting/" html = tools.my_url_open(urlstring).read() lst = re.findall('catalog/Painting/.*?html', html) return lst lst = tools.uniqueify(get_gogh_urls()) lst2 = [i[17:] for i in lst] ref_urls = [ "http://www.vangoghgallery.com/catalog/Painting/" + i for i in lst2 ] img_urls = [] for i in ref_urls: html = tools.my_url_open(i).read() s = re.findall("image/.*?/.*?.jpg", html) assert len(s) == 2 and s[0] == s[1] print i img_urls.append("http://www.vangoghgallery.com/catalog/" + s[0]) #lst2 = [i[6:-1] for i in lst] #lst4 = [] #for i in lst2: # html = tools.my_url_open("http://flower-dictionary.com/"+i).read() # lst3=re.findall("/uploads/flowers/.*?.jpg",html) # if len(lst3)!=1: # print "SHOOT:",i,len(lst3) # lst4+=lst3 #lst2b = [i[:-4] for i in lst2] #lst5 = [i[17:] for i in lst4]
def server_to_shortcut(label="test"):
    """Return the (stripped) server response for the shortcut named *label*."""
    url = "http://65.215.1.20/faces/add_shortcut.py?"
    url += urllib.urlencode([("name", label)])
    response = tools.my_url_open(url)
    return response.read().strip()
def get_gogh_urls():
    """Scrape the Van Gogh painting-catalog index; return all painting-page links."""
    urlstring = "http://www.vangoghgallery.com/catalog/Painting/"
    page = tools.my_url_open(urlstring).read()
    matches = re.findall('catalog/Painting/.*?html', page)
    return matches
import re from urllib import urlretrieve def get_gogh_urls(): urlstring = "http://www.vangoghgallery.com/catalog/Painting/" html = tools.my_url_open(urlstring).read() lst= re.findall('catalog/Painting/.*?html',html) return lst lst=tools.uniqueify(get_gogh_urls()) lst2=[i[17:] for i in lst] ref_urls = ["http://www.vangoghgallery.com/catalog/Painting/"+i for i in lst2] img_urls = [] for i in ref_urls: html = tools.my_url_open(i).read() s = re.findall("image/.*?/.*?.jpg",html) assert len(s)==2 and s[0]==s[1] print i img_urls.append("http://www.vangoghgallery.com/catalog/"+s[0]) #lst2 = [i[6:-1] for i in lst] #lst4 = [] #for i in lst2: # html = tools.my_url_open("http://flower-dictionary.com/"+i).read() # lst3=re.findall("/uploads/flowers/.*?.jpg",html) # if len(lst3)!=1: # print "SHOOT:",i,len(lst3) # lst4+=lst3
from urllib import urlretrieve def get_flower_urls(): urlstring = "http://flower-dictionary.com/list.php?page=" lst = [] for i in range(19): html = tools.my_url_open(urlstring+str(i)).read() lst+= re.findall('href=\\".*?.htm\\"',html) return lst lst=get_flower_urls() lst2 = [i[6:-1] for i in lst] lst4 = [] for i in lst2: html = tools.my_url_open("http://flower-dictionary.com/"+i).read() lst3=re.findall("/uploads/flowers/.*?.jpg",html) if len(lst3)!=1: print "SHOOT:",i,len(lst3) lst4+=lst3 lst2b = [i[:-4] for i in lst2] lst5 = [i[17:] for i in lst4] n = len(lst5) names = [] nums = [] for i in range(n): if lst5[i] not in lst5[:i]: nums.append(lst5[i])
# NOTE(review): fragment — this begins mid-way through a scrape function
# (its `def`, `urlstring`, and the enclosing try/loop are outside this view),
# then falls into a top-level page loop. Left byte-identical; presumably the
# head lives in another copy of this file — TODO locate and merge.
bbb = aaa+nxt[aaa:].find("</span>") if bbb==-1: print "YO7" continue res['att_price'] = float(nxt[aaa:bbb].replace(",","")) results.append(res) except: pass return results for p in range(326,328): allresults = [] url = urlstring.replace("pG",str(p)) #print url html = tools.my_url_open(url).read() allresults=scrape_amazon_page(html) print "results page",p, len(allresults), len(allresults)/24., len(tools.uniqueify(allresults)) i=0 ar2 = [] for r in allresults: i=i+1 imfname = "c:/temp/tiles2/"+r['ASIN']+".jpg" if not os.path.exists(imfname): try: print "Scraping object",i,"page",p,(p-1)*24+i html = tools.my_url_open(r['url']).read() r['att_glass']=1 if 'Glass Tiles</a>' in html else 0 r['att_ceramic']=1 if 'Ceramic Tiles</a>' in html else 0 r['att_stone']=1 if 'Stone Tiles</a>' in html else 0
# NOTE(review): fragment (formatted duplicate of the block above this one in
# the file) — starts inside a scrape function whose `def` and `urlstring` are
# not visible here, and is also truncated at the end. Left byte-identical;
# reconstruction without the missing head would be guesswork.
bbb = aaa + nxt[aaa:].find("</span>") if bbb == -1: print "YO7" continue res['att_price'] = float(nxt[aaa:bbb].replace(",", "")) results.append(res) except: pass return results for p in range(326, 328): allresults = [] url = urlstring.replace("pG", str(p)) #print url html = tools.my_url_open(url).read() allresults = scrape_amazon_page(html) print "results page", p, len(allresults), len(allresults) / 24., len( tools.uniqueify(allresults)) i = 0 ar2 = [] for r in allresults: i = i + 1 imfname = "c:/temp/tiles2/" + r['ASIN'] + ".jpg" if not os.path.exists(imfname): try: print "Scraping object", i, "page", p, (p - 1) * 24 + i html = tools.my_url_open(r['url']).read() r['att_glass'] = 1 if 'Glass Tiles</a>' in html else 0 r['att_ceramic'] = 1 if 'Ceramic Tiles</a>' in html else 0
import tools, time
from PIL import Image
import re
from urllib import urlretrieve

urlstring = "http://www.lifeprint.com/asl101/gifs-animated/"
# Grab every anchor tag on the animated-GIF index page.
html = tools.my_url_open(urlstring).read()
lst = re.findall('href.*</a>', html)
# Target file of each anchor: chars after 'href="' up to the closing quote.
lst2 = [a[6:a.rindex('"')] for a in lst]
# Local save name: text following the closing quote, with "</a>" trimmed off.
lst3 = [a[a.rindex('"') + 2:-4] for a in lst]
# Download pass (run once, then commented out):
#for i in range(len(lst2)):
#    urlretrieve("http://www.lifeprint.com/asl101/gifs-animated/"+lst2[i],"c:/data/signs/"+lst3[i]+".gif")
def get_flower_urls(): urlstring = "http://flower-dictionary.com/list.php?page=" lst = [] for i in range(19): html = tools.my_url_open(urlstring + str(i)).read() lst += re.findall('href=\\".*?.htm\\"', html) return lst lst = get_flower_urls() lst2 = [i[6:-1] for i in lst] lst4 = [] for i in lst2: html = tools.my_url_open("http://flower-dictionary.com/" + i).read() lst3 = re.findall("/uploads/flowers/.*?.jpg", html) if len(lst3) != 1: print "SHOOT:", i, len(lst3) lst4 += lst3 lst2b = [i[:-4] for i in lst2] lst5 = [i[17:] for i in lst4] n = len(lst5) names = [] nums = [] for i in range(n): if lst5[i] not in lst5[:i]: nums.append(lst5[i])