def wieistmeineip(self):
    result = {}
    # Save original socket
    originalSocket = socket.socket
    # Set TOR SOCKS proxy
    commonutils.setTorProxy()
    try:
        # Load the page through the proxy
        soup = self.parse("http://www.wieistmeineip.de")
        location = soup.findAll("div", {"class": "location"})[0]
        location = bs(location.text, convertEntities=bs.HTML_ENTITIES)
        ip = soup.findAll('div', id='ipv4')[0]
        raw_ip = bs(ip.text, convertEntities=bs.HTML_ENTITIES)
        pattern = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}')
        ip = re.search(pattern, raw_ip.text)
        result["ipaddress"] = ip.group(0)
        result["country"] = str(location)
    finally:
        # Remove the SOCKS Tor proxy and restore the original socket
        socket.socket = originalSocket
    return result

def get_text_from_paragraphs(self, paragraphs_list, bs_doc):
    """
    Returns a list of elements corresponding to the list of words
    from a particular section.
    """
    words_in_paragraph_list = []
    i = 0
    for paragraph in paragraphs_list:
        try:
            pmid_in_bracket = bs(str(bs_doc.findAll(attrs={"pub-id-type": "pmid"})[0])).findAll(text=True)
        except:
            pmid_in_bracket = bs(str(bs_doc.findAll(attrs={"pub-id-type": "pmc"})[0])).findAll(text=True)
        pmid = str(pmid_in_bracket[0].encode("utf-8"))
        words_in_one_paragraph = []
        # print paragraph
        for text in paragraph.findAll(text=True):
            words = text.split()
            encoded = self.encode_list(words)
            cleaned = self.remove_digits(encoded)
            lowered = self.lower_words(cleaned)
            replaced = self.replace_punc(lowered)
            stems = self.into_stems(replaced)
            for word in stems:
                words_in_one_paragraph.append(word)
        i += 1
        words_in_paragraph_list.append((pmid, i, words_in_one_paragraph))
    return words_in_paragraph_list

def main(url):
    """
    blogger_image_grab.py

    Downloads all the images on the supplied Blogger blog and saves them
    to the Downloads directory.

    Usage:
        python blogger_image_grab.py http://example.com
    """
    # Send the request with a random user agent in the header
    request = urllib2.Request(url, None, randomize_user_agent())
    html = urllib2.urlopen(request)
    soup = bs(html)
    parsed = list(urlparse.urlparse(url))
    download_images(soup, parsed)
    older_posts = soup.find(text='Older Posts')
    while older_posts:
        print 'Navigating to the next page: %s' % older_posts.previous['href']
        soup = bs(urlopen(older_posts.previous['href']))
        parsed = list(urlparse.urlparse(url))
        download_images(soup, parsed)
        older_posts = soup.find(text='Older Posts')
    if not older_posts:
        print 'Downloading complete!'

def porterScrape():
    """docstring for porterScrape"""
    storer = {}

    data = urlopen(jeans).read()
    b = bs(data)
    designers = b.findAll('div', {'class': 'designer'})
    for d in designers:
        pid = int(d.find('a').get('href')[9:])
        brand = d.find('span', {'class': 'product-designer'}).text
        title = d.find('span', {'class': 'product-title'}).text
        newd = dict(brand=brand, title=title)
        storer[pid] = newd

    data = urlopen(trousers).read()
    b = bs(data)
    designers = b.findAll('div', {'class': 'designer'})
    for d in designers:
        pid = int(d.find('a').get('href')[9:])
        brand = d.find('span', {'class': 'product-designer'}).text
        title = d.find('span', {'class': 'product-title'}).text
        newd = dict(brand=brand, title=title)
        storer[pid] = newd

    return storer

def __init__(self, lnv, fec):
    self.lnv = wenku8.WENKU8_PREFIX + wenku8.LNV_LIST[lnv]
    self.fec = fec
    self.req = urllib2.Request(self.lnv, '', wenku8.OPERA_X_H)
    self.response = urllib2.urlopen(self.req)
    # use BeautifulSoup
    self.soup = bs(self.response)
    # chapter/volume counter
    ctr_chp = 0
    #ctr_vol = -1
    self.chps = list()
    self.tsp = bs()
    fl = open('list' + '_' + self.fec + '.txt', 'w')
    for atr in self.soup.body.findAll('tr'):
        tsp = bs(atr.text)
        if tsp.a != None:
            ctr_chp += 1
            sn = unicode(ctr_chp).rjust(3, '0')
            lk = tsp.a['href']
            title = unicode(tsp.contents[0])[:-1]
            self.chps.append((sn, lk, title))
            fl.write(sn.encode(fec, 'ignore') + '.txt' + ' '
                     + title.encode(fec, 'ignore') + '\n')
        tsp.close()
    fl.close()
    self.soup.close()
    return

def main():
    # Bloomberg has these pages on which all company names are listed
    PAGES = ['0-9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
             'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
             'x', 'y', 'z', 'other']

    # Initialize collectors
    collectorTitle = ['TICKER', 'NAME', 'EXCHANGE', 'WEBSITE', 'INDUSTRY', 'SECTOR', 'SUMMARY']
    execCollectorTitle = ['TICKER', 'NAME', 'EXCHANGE', ('POSITION TITLE', 'EXECUTIVE\'S NAME')]
    myCSVwriter(CSVNAME, collectorTitle, 1)
    myCSVwriter(EXECSCSVNAME, execCollectorTitle[0:3] + [elt for tup in execCollectorTitle[3] for elt in tup], 1)

    for p in PAGES:
        soup = bs(urllib2.urlopen('http://www.bloomberg.com/markets/companies/a-z/' + p + '/'))
        # Get remaining pages in the $p category, then loop over scraping those
        try:
            rem_pages = [str(i['href']) for i in soup.find('div', {'class': 'dictionary_pagination'}).findAll('a')]
        except:
            rem_pages = []
        getPageData(soup)
        # print 'Finished 1st page of ' + p
        # Collect data on remaining pages of $p
        for r in rem_pages:
            getPageData(bs(urllib2.urlopen('http://www.bloomberg.com' + r)))

def classes(text):
    soup = bs(text)
    search = soup.body.findAll('td', attrs={'align': 'right'})
    for row in search:
        if row.br:
            set = str(row)
    soup = bs(set)
    grade = soup.td.contents[0]
    return grade.split('%')[0]

def __search_results(self, page):
    start = time.time()
    if page == 1:
        results = bs(urlopen(baseURL + queryString + self.searchTerm),
                     parseOnlyThese=ss('a', 'result_primary_link'))
    else:
        results = bs(urlopen(baseURL + queryString + self.searchTerm + searchPageString + str(page)),
                     parseOnlyThese=ss('a', 'result_primary_link'))
    for link in results.contents:
        if link['result-type'] == 'Talk' and not link['href'] in self.listOfPosts:
            Investigator.__result(self, link['href'])
    print "__search_results Elapsed Time: %s" % (time.time() - start), self.searchTerm, ' page: ', page

def make_trial_soup():
    xml = ''.join(f_data)
    soup = bs(xml)
    ssoup = bss(xml)
    trial_soup = []  # each item of the list is a BeautifulSoup object
    for i in ssoup('trial'):
        j = bs(str(i))
        trial_soup.append(j)
    return trial_soup

def content_registry_pretty_message(cls, message):
    messageBody = ''
    try:
        if '<html' in message:
            messageBody = bs(message).find('body').text
        elif '<?xml' in message:
            messageBody = bs(message).find('response').text
    except:
        messageBody = message
    return messageBody

def main():
    oldXmlReport = sys.argv[1]
    newXmlReport = sys.argv[2]
    oldSoup = bs(file(oldXmlReport).read(), convertEntities=bs.HTML_ENTITIES)
    newSoup = bs(file(newXmlReport).read(), convertEntities=bs.HTML_ENTITIES)
    begin = time.time()
    comparator = create_comparator(oldSoup, newSoup)
    end = time.time()
    print '\n# took {0:.2f} secs to build the stats ...\n'.format(end - begin)
    comparator.show_options()

def schedule(text):
    soup = bs(text)
    l = soup.body.table.findAll('td', attrs={'class': 'scheduleBody'})
    final = []
    for row in l:
        if 'portal' in str(row):
            if row:
                sp = bs(str(row))
                url = sp.a['href']
                name = sp.a.b.contents[0]
                final.append({'url': url, 'name': name})
    return final

def get_song_list(self):
    if self.list_of_url:
        for line in open(self.file):
            next_url = line.strip()
            soup = bs(self._get_content(next_url))
            self.log.debug("Q-length : %d, Parsing URL : %s"
                           % (self.command_queue.qsize(), next_url))
            yield self.parse_html(soup)
    else:
        soup = bs(self._get_content())
        yield self.parse_html(soup)

def get_video_from_part_link(part_link):
    reg = 'file: "(.+?)"'
    pattern = re.compile(reg)
    basic_url = 'http://nbahd.com/'
    req = urllib2.Request(
        url=part_link,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
    request = urllib2.urlopen(req)
    html = request.read()
    soup = bs(html)
    try:
        tag = soup.findAll('div', {'class': 'page-content rich-content'})[0]
    except:
        tag = soup.findAll('div', {'class': 'entry-content rich-content'})[0]
    tag = tag.findAll('iframe')[0]
    url = tag['src']
    url = basic_url + url
    request = urllib2.urlopen(url)
    html = request.read()
    soup = bs(html)
    try:
        video_tag = re.findall(pattern, html)
        my_addon = xbmcaddon.Addon()
        HD = my_addon.getSetting('quality')
        if HD == 'false':
            ind = 1
        else:
            ind = 0
        src = video_tag[ind]
    except:
        video_tag = soup.findAll('video')[0]
        my_addon = xbmcaddon.Addon()
        HD = my_addon.getSetting('quality')
        if HD == 'false':
            ind = 1
        else:
            ind = 0
        tag = video_tag.findAll('source')[ind]
        src = tag['src']
    return src

def get_urls_robtex(ip):
    request = urllib2.Request("http://ip.robtex.com/%s.html" % ip)
    request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
    robtex = bs(urllib2.urlopen(request))
    websiteslist = []
    tmp = robtex.findAll("span", {"id": re.compile("dns")})
    a = bs(str(tmp))
    for url in a('a'):
        if url.string is not None:
            websiteslist.append("http://" + url.string)
    return websiteslist

def update(self):
    if self.steamalias:
        data = urllib2.urlopen('http://steamcommunity.com/id/%s' % self.userid).read()
    else:
        data = urllib2.urlopen('http://steamcommunity.com/profiles/%s' % self.userid).read()
    data = bs(data, convertEntities=bs.HTML_ENTITIES)
    try:
        self.username = data.find(id="mainContents").h1.contents[0].strip()
    except Exception:
        return
    try:
        self.status = data.find(id='statusOnlineText').string
        self.status = 1
    except Exception:
        pass
    if not self.status:
        try:
            self.game = data.find(id='statusInGameText').string.strip()
            self.status = 2
        except Exception:
            pass
    if not self.status:
        try:
            if data.find('p', 'errorPrivate'):
                self.status = 3
        except Exception:
            pass
    if not self.status:
        try:
            self.lastseen = data.find(id='statusOfflineText').string.replace('Last Online: ', "")
            self.status = 0
        except Exception:
            pass
    if self.status == 2:
        # The user is in-game; retrieve the server IP if possible
        try:
            friendurl = data.find(id='friendBlocks').div.div.div.a['href']
        except Exception:
            return
        friendurl = friendurl + '/friends'
        data = urllib2.urlopen(friendurl).read()
        data = bs(data, convertEntities=bs.HTML_ENTITIES)
        try:
            self.server = data.find('a', text=self.username).parent.parent.span.find('a')['href'][16:]
        except Exception:
            pass

def crs():
    for x in range(0, len(list_subject[0])):
        print str(x) + '|' + list_subject[1][x] + "|" + list_subject[2][x] + "|" + list_subject[3][x]
    select = raw_input("Select the subject by entering its serial number; to choose a faculty other than "
                       "the one you registered with, add \"d\" at the end (e.g. \"2d\" without the quotes): ")
    if select[-1] == 'd':
        select = int(select[0])
        ch = facchoice(select)
        ch = ch.split(' - ')
        crsp = crpg % (list_subject[1][select], list_subject[3][select], ch[0])
        fac = ch[1]
    else:
        crsp = crpg % (list_subject[1][int(select)], list_subject[3][int(select)], list_subject[6][int(select)])
        fac = list_subject[5][int(select)]
    url = baseurl + crsp
    k = br.open(url)
    hh = k.read()
    souph = bs(hh)
    input = souph.findAll('input')
    name2 = input[2].get('name')
    name3 = input[3].get('name')
    name4 = input[4].get('name')
    value2 = input[2].get('value')
    value3 = input[3].get('value')
    value4 = input[4].get('value')
    formdat = {name2: value2, name3: value3, name4: value4}
    data_encoded = urllib.urlencode(formdat)
    response = br.open('https://vtop.vit.ac.in/student/coursepage_view3.asp', data_encoded)
    htmla = response.read()
    soupa = bs(htmla)
    lst = [h for h in soupa.findAll('a')]
    link = [each.get('href') for each in lst]
    for x in link:
        try:
            u = urllib2.urlopen(baseurl + "/" + x)  # Test whether the link is valid and downloadable
        except:
            link.remove(x)
    path_folder = createfold(list_subject[1][int(select)] + '-' + list_subject[2][int(select)], fac)
    for x in range(0, len(link)):
        try:
            url = link[x]
            file_nam = lst[x].text
            ##print file_nam
            down(url, file_nam, path_folder)
        except Exception, e:
            print e
            pass
    print '*' * 79

def main():
    # Scrape all the comic data. There are 9 chapters with < 150 pages each.
    comic = {}
    for chapter in range(10):
        for page in [str(x).zfill(2) for x in range(150)]:
            try:
                res = requests.get(
                    'http://www.casualvillain.com/Unsounded/comic/ch0%d/ch0%d_%s.html'
                    % (chapter, chapter, page))
                if res.ok:
                    quiet = comic.setdefault(chapter, {})
                    comic[chapter][int(page)] = res.content
            except:
                continue

    # I know that since the indexes are ints, they are likely to be in order,
    # but sorting to be on the safe side.
    chapters = comic.keys()
    chapters.sort()
    for chapter in chapters:
        data = comic[chapter]
        pages = data.keys()
        pages.sort()
        if len(pages) != max(pages):
            print 'Missing pages from chapter %d' % chapter
        for page in pages:
            try:
                page_data = data[page]
                soup = bs(page_data)
                this_page = 'ch%s_%s.html' % (str(chapter).zfill(2), str(page).zfill(2))
                try:
                    next_link = soup.findAll("a", {"class": "forward"})[0]['href']
                except:
                    # At the end of a chapter there is no forward link.
                    next_link = 'ch%s_%s.html' % (str(chapter + 1).zfill(2), str(1).zfill(2))
                comic_element = soup.find("div", {"id": "comic"}).find('img')
                link_soup = bs('<a href="%s">' % next_link)
                link_soup.find('a').insert(0, comic_element.extract())
                soup.find("div", {"id": "comic"}).insert(0, link_soup)
                prettyHTML = soup.prettify()
                with open(this_page, 'w') as f:
                    print >> f, prettyHTML
            except Exception as exc:
                print 'Bad chapter %d page %d %s' % (chapter, page, repr(exc))

def get_urls(br):
    raw_urls = []
    src = br.page_source
    soup = bs(src)
    for a in soup.findAll('a', href=True):
        raw_urls.append(urlparse.urljoin(archive_link, a['href']))
    return raw_urls

def parse_item(self, response):
    '''
    Visit each news page and extract the field values.
    :param response:
    :return:
    '''
    logUtil.getLog().info('news url :%s' % response.url)
    item = FenghuoItem()
    root = bs(response.body)
    item['topPost'] = "1"
    item["site_id"] = "13"
    item['website_id'] = ''
    item["site_name"] = '通山县机构编制网'
    item["area"] = "958"
    item["site_weight"] = "2"
    item['countryid'] = "1156"
    item['province'] = "1673"
    item['city'] = "136"
    item["ip"] = socket.gethostbyname("www.tsxbb.gov.cn")
    item["site_url"] = "www.tsxbb.gov.cn"
    item["forumurl"] = response.meta['forumurl']
    item["site_cls"] = '1'
    item["url"] = response.url
    item["subname"] = root.find("span", attrs={"class": "text14h"}).find("a", attrs={"href": "../"}).text
    item["title"] = root.find("td", attrs={"class": "textbiaoti"}).text
    str = root.find("td", attrs={"class": "text12hui"}).text
    str = str[str.index('20'):]
    item["pubdate"] = str[:str.index(' ') - 1]
    try:
        str = str[str.index('su = ') + 6:]
        item["website_id"] = str[:str.index(';') - 1]
    except:
        item["website_id"] = ""
    styles = root.find("div", attrs={"class": "TRS_Editor"}).findAll("style")
    for style in styles:
        style.clear()
    # Replace all image tags with their prettified markup
    imgs = root.find("div", attrs={"class": "TRS_Editor"}).findAll("img")
    for img in imgs:
        img.replaceWith(img.prettify())
    item["txt"] = root.find("div", attrs={"class": "TRS_Editor"}).text \
        .replace("\r\n", "$*huanhang*$").replace("\n", "$*huanhang*$") \
        .replace("\"", "'").replace("<br />", "$*huanhang*$")
    item["txt_len"] = len(item["txt"])
    item["domain_1"] = "tsxbb.gov.cn"
    item["domain_2"] = ""
    item["snatch_time"] = datetime.datetime.now().__format__("")
    item["task_id"] = response.meta['task_id']
    self.saveData.saveContext(item)

def search_album(singer, song):
    url = 'http://www.allmusic.com/search/song/' + singer + '%20' + song
    i = 0
    for x in range(10):
        try:
            website_html = requests.get(url).text
            soup = bs(website_html)
        except requests.exceptions.RequestException as e:
            i += 1
            continue
        break
    if i >= 9:
        return -1
    compare = 0
    temp = []
    for line in soup.findAll("li", {'class': "song"}):
        for par in line.findAll("div"):
            if par.get('class') == "title" and par.text.replace('\"', '').lower() == song.lower():
                compare += 1
                temp.append(par)
                continue
            if par.get('class') == "performers" and par.find('a').contents[0].lower() == singer.lower():
                compare += 1
                continue
        compare = 0
    if len(temp) != 0:
        for a in temp[0].findAll('a', href=True):
            return search_album_name(a['href'])
    return -1

def manageregex(self, pattern):
    out_folder = constants.fileloc + "files/"
    try:
        os.mkdir(out_folder)
    except:
        pass
    if self.direct:
        urlretrieve(self.url, out_folder + "/" + self.url.split()[-2:])
    else:
        soup = bs(urlopen(self.url))
        parsed = list(urlparse.urlparse(self.url))
        tota = soup.findAll("a")
        tot = len(tota)
        n = 0
        pat = re.compile(pattern)
        for a in tota:
            n += 1
            try:
                if pat.match(str(a['href'])):
                    filename = a["href"].split("/")[-1]
                    parsed[2] = a["href"]
                    outpath = os.path.join(out_folder, filename)
                    if a['href'].lower().startswith("http"):
                        urlretrieve(a['href'], outpath)
                    else:
                        urlretrieve(urlparse.urljoin(self.url, a['href']), outpath)
                    yield (n * 100) / tot
            except:
                pass

def manageimages(self):
    out_folder = constants.fileloc + "images/"
    try:
        os.mkdir(out_folder)
    except:
        pass
    if self.direct:
        urlretrieve(self.url, out_folder + "/" + self.url.split()[-2:])
    else:
        soup = bs(urlopen(self.url))
        parsed = list(urlparse.urlparse(self.url))
        totim = soup.findAll("img")
        tot = len(totim)
        n = 0
        for image in totim:
            n += 1
            filename = image["src"].split("/")[-1]
            parsed[2] = image["src"]
            outpath = os.path.join(out_folder, filename)
            try:
                if image["src"].lower().startswith("http"):
                    urlretrieve(image["src"], outpath)
                else:
                    urlretrieve(urlparse.urljoin(self.url, image["src"]), outpath)
                yield (n * 100) / tot
            except:
                pass

def get_episodes(season, season_num):
    url = season
    if domain not in url:
        url = domain + season
    html = read_url(url)
    soup = bs(html)
    tag = soup.find('div', {'class': 'Episode'})
    reg = re.compile('<a href="(.+?)".+?>')
    links = list(re.findall(reg, str(tag)))
    reg2 = re.compile('</strong> (\d+) - (.+?)</a>')
    names = re.findall(reg2, str(tag))
    out = []
    last_num = 0
    spec = addon.get_setting('specials')
    if spec == 'false':
        for i in range(len(links)):
            check = int(names[i][0]) - last_num
            if 'special:' not in names[i][1].lower() and check == 1:
                out += [[links[i], names[i][1], season_num, names[i][0]]]
                last_num = int(names[i][0])
        imdb = re.compile('[\"\']http://www.imdb.com/title/(.+?)[\"\']')
        imdb = re.findall(imdb, str(soup))[0]
        return imdb, out
    else:
        for i in range(len(links)):
            out += [[links[i], names[i][1], season_num, names[i][0]]]
        imdb = re.compile('[\"\']http://www.imdb.com/title/(.+?)[\"\']')
        imdb = re.findall(imdb, str(soup))[0]
        return imdb, out

def PLAY_FULL(name, url, iconimage):
    albumlist = []
    link = client.request(url)
    soup = bs(link)
    threads = []
    album_icon = iconimage
    print("ALBUM ICON", album_icon)
    r = soup.find('div', {'class': 'artist-songs'})
    global count
    reg = re.compile('<div class="song-name"><a href="([^"]+)">(.*?)</a></div>')
    result = re.findall(reg, str(r))
    count = 0
    playlist = xbmc.PlayList(0)
    playlist.clear()
    progressDialog = control.progressDialog
    progressDialog.create('Karma', '')
    progressDialog.update(0)
    for url, title in result:
        if progressDialog.iscanceled():
            break
        count += 1
        url = re.sub('/track/', '/download/', url)
        url = base_url + url
        title = client.replaceHTMLCodes(title)
        progress = (float(count) / float(len(result))) * 100
        progressDialog.update(int(progress), 'Retrieving and Checking Songs...', title)
        w = workers.Thread(fetch_album, url, title, album_icon)
        w.start()
        w.join()
    xbmc.Player().play(playlist)

def search_album_name(url):
    if url == '':
        return -1
    i = 0
    for x in range(10):
        try:
            website_html = requests.get(url).text
            soup = bs(website_html)
        except requests.exceptions.RequestException as e:
            i += 1
            continue
        break
    if i >= 9:
        return -1
    albums = []
    for a in soup.findAll("td", {'class': "artist-album"}):
        if 'Various Artists' in a.find("span", {'itemprop': "name"}).text:
            continue
        for b in a.findAll("div", {'class': "title"}):
            albums.append(b.find('a').text)
    if len(albums) != 0:
        return albums
    else:
        return -1

def tp(self, url, tweet):
    '''
    api_location = {}
    try:
        json_reply = simplejson.load(urllib.urlopen("http://api.twitpic.com/2/media/show.json?id=" + url.path[1:]))
        if 'location' in json_reply:
            api_location['from'] = 'twitpic_api'
            api_location['time'] = json_reply['timestamp']
            api_location['coordinates'] = json_reply['location']
    except simplejson.JSONDecodeError:
        #print "error produced by http://api.twitpic.com/2/media/show.json?id="+url.path[1:]
    '''
    try:
        # Handle some bad HTML in twitpic
        html = urllib.urlopen(url.geturl()).read()
        html = html.replace('</sc"+"ript>', '')
        soup = bs(html)
        # Grab the photo from cloudfront
        temp_file = os.path.join(self.photo_dir, url.path[1:])
        photo_url = soup.find(attrs={"id": "content"}).find(src=re.compile("cloudfront"))['src']
        urllib.urlretrieve(photo_url, temp_file)
        return [self.exif_extract(temp_file, tweet)]
    except Exception:
        err = 'Error trying to download photo'
        self.errors.append({
            'from': 'twitpic',
            'tweetid': tweet.id,
            'url': url.geturl(),
            'error': err
        })
        return []

def parse_pages(self, response):
    self.log('Hi, this is the second page %s' % response.url)
    root = bs(response.body)
    forumurl = response.url
    pageid = forumurl[forumurl.index("orum-") + 5:forumurl.index("-1.html")]
    print forumurl
    try:
        pageText = root.find("div", attrs={"class": "pg"}).find("span").text
        pageText = pageText[pageText.index("/") + 2:]
        totalpage = pageText[:pageText.index(" ")]
    except:
        totalpage = 1
    root_url = "http://www.baotuowang.com/forum.php?mod=forumdisplay&fid="
    for i in range(1, int(totalpage) + 1):
        url = root_url + pageid + "&page=" + str(i)
        yield scrapy.Request(url, self.parse_page,
                             meta={'forumurl': forumurl, 'pageid': pageid})

def check_spider():
    new = 0
    page = 1
    while new > 20:
        new = 0
        r = requests.get(url)
        root = bs(r.text)
        page += 1
        url = "http://www.cntongshan.com/News/NewsList-0-2-AA-p" + str(page) + ".html"
        uls = root.find("div", attrs={"class": "ListMain"}).findAll("ul", attrs={"class": "l_l"})
        for ul in uls:
            lis = ul.findAll("li")
            for li in lis:
                div = li.find("div")
                curl = div.find("a").get("href")
                li.find("div").clear()
                datestr = li.text
                month = datestr[:datestr.index("月")]
                day = datestr[datestr.index("月") + 1:datestr.index("日")]
                hour = datestr[datestr.index("日") + 2:datestr.index(":")]
                minute = datestr[datestr.index(":") + 1:]
                date_t = datetime(2015, int(month), int(day), int(hour), int(minute))
                if date_t > last_t:
                    new += 1
                    print date_t
                    spider_cnts(curl)

def get_ttv():
    url = 'http://www.acesportstream.com'
    url = read_url(url)
    soup = bs(url)
    channels1 = soup.find('div', {'id': 'hd'}).findAll('a')
    channels2 = soup.find('div', {'id': 'blue'}).findAll('a')
    for channel in channels1:
        link = channel['href']
        img = channel.find('img')['src']
        name = clean(cleanex(channel['title']))
        url = build_url({'mode': 'open_ttv_stream', 'url': link,
                         'name': name.encode('ascii', 'ignore')})
        li = xbmcgui.ListItem('%s' % name, iconImage=img)
        li.setProperty('IsPlayable', 'true')
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=url, listitem=li)
    for channel in channels2:
        link = channel['href']
        img = channel.find('img')['src']
        name = clean(cleanex(channel['title']))
        url = build_url({'mode': 'open_ttv_stream', 'url': link,
                         'name': name.encode('ascii', 'ignore')})
        li = xbmcgui.ListItem('%s' % name, iconImage=img)
        li.setProperty('IsPlayable', 'true')
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=url, listitem=li)
    xbmcplugin.endOfDirectory(addon_handle)

def download(url):
    '''
    Pull the page and parse it into the pieces we need.
    '''
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(kCookieFile):
        cookieJar.load(kCookieFile)
    else:
        cookieJar.save(kCookieFile)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    link = opener.open(url)
    page = link.read()
    soup = bs(page)
    scores = soup.findChildren('table', {"class": "scores"})
    for i, aSection in enumerate(scores):
        scoresArray = []
        away = ""
        home = ""
        teams = aSection.findChildren(None, {"class": "yspscores team"})
        for i, aTeam in enumerate(teams):
            name = aTeam.findChild('a')
            if 0 == i:
                away = name.text
            else:
                home = name.text
        qtrScores = aSection.findChildren(None, {"class": "yspscores"})
        for i, qtr in enumerate(qtrScores):
            scoresArray.append(cleanText(qtr.text))
        printScores(away, home, scoresArray)

def Tureng(word):
    f = urllib.urlopen("http://tureng.com/en/turkish-english/" + str(word))
    soup = bs(f)
    dummy = 0
    emocan = 0
    for string in soup.findAll('a'):
        if int(dummy) > 20:
            wflag = 0
            for x in range(0, len(uList)):
                if string.string == uList[x]:
                    wflag = 1
                    break
            if wflag == 0:
                if string.string != None:
                    for tr in range(0, len(tr_Error)):
                        if tr_Error[tr] in string.string:
                            string.string = string.string.replace(tr_Error[tr], fix[tr])
                    if emocan == 0:
                        try:
                            print string.string.decode('utf-8'), ' ---> ',
                        except:
                            print string.string, ' ---> ',
                        emocan = 1
                    else:
                        try:
                            print string.string.decode('utf-8')
                        except:
                            print string.string
                        emocan = 0
        dummy += 1

def get_data_from_page(schedule):
    # Get data from each TAMU bus schedule page
    for route_page in route_pages:
        bus_page = urllib2.urlopen('http://transport.tamu.edu/busroutes/Route' + route_page + '.aspx')
        data = bs(bus_page)
        # Store the schedule for this particular route
        route_bus_stops = []
        # Get all bus stops and their schedule
        for table in data.findAll('table'):
            # Get bus stop names
            for tr in table.findAll('tr')[1]:
                route_bus_stops.append(tr.string.strip())
            # Get the schedule
            for tr in table.findAll('tr')[2:]:
                for td in tr.findAll('td'):
                    if td.string not in blanks and td.nextSibling.string not in blanks:
                        schedule.append((route_bus_stops[tr.index(td)], td.string,
                                         route_bus_stops[tr.index(td.nextSibling)],
                                         td.nextSibling.string, route_page))
                        # print route_bus_stops[tr.index(td)], td.string, route_bus_stops[tr.index(td.nextSibling)], td.nextSibling.string, route_page
    print
    print 'Schedule:', len(schedule)

def save_wordlist(raw_page):
    soup = bs(raw_page)
    wordlist = str.split(soup.__str__())
    f = open(PATH, 'a')
    for word in wordlist:
        f.write(word + '\n')
    f.close()

def send_pings(post):
    logger.debug("send_pings entered")
    if settings.DEBUG:
        logger.warn("Not sending pings in debug")
        return
    if post.status == 'publish':
        # Check for outgoing links.
        target_urls = []
        logger.debug("post.body")
        soup = bs(post.get_formatted_body())
        logger.debug(str(soup))
        for a in soup.findAll('a'):
            target_url = a.get('href', None)
            if target_url:
                logger.info("Got URL: " + a.get('href'))
                target_urls.append(target_url)
        logger.info("Checking out %d url(s)" % len(target_urls))
        for url in target_urls:
            pb_urls, tb_urls = get_ping_urls(url)
            for pb in pb_urls:
                logger.info("Got pingback URL: %s" % pb)
                pingback_ping(post.get_absolute_url(), pb, post=post, outgoing=True)
            for tb in tb_urls:
                logger.info("Got trackback URL: %s" % tb)
                trackback_ping(post.get_absolute_url(), tb, post=post, outgoing=True)

def get_links_putlocker(show, season, episode):
    show = show.replace(' 2014', '').replace(' 2015', '')
    show = show.rstrip().replace(' ', '-').replace('!', '').replace('?', '').replace('--', '')
    url = 'http://putlocker.is/watch-%s-tvshow-season-%s-episode-%s-online-free-putlocker.html' % (show, season, episode)
    print(url)
    read = read_url(url)
    soup = bs(read)
    table = soup.findAll('table', {'class': 'table', 'border': '0', 'cellspacing': '0',
                                   'cellpadding': '0', 'width': '100%'})[2]
    trs = table.findAll('tr')
    results = []
    reg = 'http://www.(.+?)/'
    pat = re.compile(reg)
    for i in range(len(trs)):
        try:
            link = trs[i].find('td', {'width': '100%'}).find('a')['href']
            title = re.findall(pat, link)[0]
            results.append([title, link])
        except:
            pass
    return results

def managefiles(self):
    out_folder = constants.fileloc + "files/"
    try:
        os.mkdir(out_folder)
    except:
        pass
    if self.direct:
        urlretrieve(self.url, out_folder + "/" + self.url.split()[-2:])
    else:
        soup = bs(urlopen(self.url))
        parsed = list(urlparse.urlparse(self.url))
        tota = soup.findAll("a")
        tot = len(tota)
        n = 0
        for a in tota:
            n += 1
            filetype = a['href'].split(".")[-1]
            if filetype in self.format:
                filename = a["href"].split("/")[-1]
                parsed[2] = a["href"]
                outpath = os.path.join(out_folder, filename)
                try:
                    if a['href'].lower().startswith("http"):
                        urlretrieve(a['href'], outpath)
                    else:
                        urlretrieve(urlparse.urljoin(self.url, a['href']), outpath)
                    yield (n * 100) / tot
                except:
                    pass

def get_category(site, page):
    if page == '1':
        pass
    else:
        site = site + '%s/' % page
    reg = 'href="(.+?)"'
    pat = re.compile(reg)
    req = urllib2.Request(
        url=site,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
    request = urllib2.urlopen(req)
    html = request.read()
    soup = bs(html)
    linksout = []
    tags = soup.findAll('article')
    for i in range(len(tags)):
        names = tags[i].find('h1', {'class': 'entry-title'})
        h = HTMLParser.HTMLParser()
        ime = h.unescape(names.getText())
        link = re.findall(pat, str(names))[0]
        img = tags[i].find('img')['src']
        ps = len(tags[i].findAll('p')) - 2
        linksout += [[link, ime, img]]
    return linksout

def search_genre(singer, album):
    #genre = d.search(title, artist=singer)
    url = 'http://www.allmusic.com/search/albums/' + album + '%20' + singer
    i = 0
    for x in range(10):
        try:
            website_html = requests.get(url).text
            soup = bs(website_html)
        except requests.exceptions.RequestException as e:
            i += 1
            continue
        break
    if i >= 9:
        return -1
    _genres = {}
    for a in soup.findAll("div", {'class': "genres"}):
        for b in a.text.split(','):
            _genres[b] = 1
        break
    if _genres is not None:
        return _genres.keys()
    else:
        return -1

def get_iwatch_links(url):
    try:
        links = []
        hosts = []
        html = read_url(url)
        soup = bs(html)
        table = soup.find('table', {'id': 'streamlinks'})
        trs = table.findAll('tr')
        trs.pop(0)
        for i in range(len(trs)):
            try:
                item = trs[i]
                link = item.find('td').find('a')['href']
                host = item.find('td').find('a').getText().lstrip().rstrip().lower().lstrip('.')
                ind = host.index('<')
                links.append(link)
                hosts.append(host[:ind])
            except:
                pass
        return links, hosts
    except:
        return [], []

def discover_feeds(url):
    """
    Returns a list of possible candidate feeds found or an error message.

    Results are returned as a two-tuple of (success, data) where data will
    be an error message if not successful.
    """
    try:
        data = get_resource(url)
        logger.debug("got data")
        soup = bs(data)
        link_list = soup('link')
        candidates = []
        for link in link_list:
            mime = link.get('type')
            if mime in ['application/atom+xml', 'application/rss+xml']:
                uri = complete_uri(url, link.get('href'))
                candidates.append((link.get("title", uri), uri))
        if not candidates:
            return False, "No feeds found."
        return True, candidates
    except urllib2.URLError as e:
        if hasattr(e, 'reason'):
            msg = "I failed to reach a server: %s" % e.reason
            return False, msg
        elif hasattr(e, 'code'):
            msg = "The server couldn't fulfill our request (%s %s)" % (e.code, httplib.responses[e.code])
            return False, msg
        return False, "Unknown error."

def warwick_ical():
    """Log in to Warwick web sign-on and fetch the engineering timetable as iCal data."""
    warwickurl = "http://www.eng.warwick.ac.uk/cgi-bin/timetable"
    bot = httpbot.HttpBot()
    bot.POST('https://websignon.warwick.ac.uk/origin/slogin?providerId=urn%3Awww.eng.warwick.ac.uk%3Ageneral%3Aservice&target=http://www.eng.warwick.ac.uk/cgi-bin/timetable',
             {'userName': warwickusername, 'password': warwickpassword})
    response = bot.GET(warwickurl)
    soup = bs(response)
    try:
        ical_url = soup.findAll('a')[0]['href'].replace('webcal', 'http')
    except KeyError:
        while 1:
            again = raw_input('Login Error! Try again? (y) ')
            if again == 'y':
                return warwick_ical()
            elif again == 'n':
                exit(0)
            else:
                print 'Please enter y or n'
    return urllib2.urlopen(urllib2.Request(ical_url)).read()

def __user(self, user):
    try:
        start = time.time()
        inQueue = Queue()
        outQueue = Queue()
        processes = []
        links = bs(urlopen(baseURL + user + '/activity'),
                   parseOnlyThese=ss('a', href=re.compile('/post/a.')))
        for link in links.contents:
            if link['href'] not in self.visitedPosts:
                inQueue.put(link['href'])
                self.visitedPosts.append(link['href'])
        for i in range(cpu_count()):
            p = Process(target=Investigator.__posts, args=(self, inQueue, outQueue))
            p.start()
            processes.append(p)
        inQueue.put('STOP')
        for p in processes:
            p.join()
        outQueue.put('STOP')
        for post in iter(outQueue.get, 'STOP'):
            self.listOfPosts.append(post)
        print "__user Elapsed Time: %s" % (time.time() - start), user
    except HTTPError:
        print 'HTTPError:', user

def get_livefoot(url, name):
    names, links = [], []
    html = read_url(url)
    soup = bs(html)
    tag = soup.find('div', {'id': 'maininner'})
    tag = tag.find('div', {'class': 'content clearfix'})
    trs = tag.findAll('tr')
    for item in trs:
        try:
            language = item.findAll('td')[0].getText()
            txt = item.findAll('td')[1].getText()
        except:
            language = '[N/A]'
            txt = ''
        if language == '':
            language = '[N/A]'
        if 'acestream' in txt.lower() or 'sopcast' in txt.lower():
            link = item.findAll('td')[1].find('a')['href']
            title = '%s %s' % (txt, language)
            links += [link]
            names += [title]
        else:
            pass
    if links != []:
        dialog = xbmcgui.Dialog()
        index = dialog.select('Select a channel:', names)
        if index > -1:
            name = names[index]
            url = links[index]
            play_livefoot(url, name)
    else:
        xbmcgui.Dialog().ok('No stream', 'No stream available yet!')

def spider():
    print "start!"
    pwd = "/Users/bohaohan/iss/商务智能/code/img/"
    tail = ".png"
    url = "http://www.yeslux.com/pinpai.html"
    r = requests.get(url)
    r.encoding = 'gb2312'
    with open(pwd + "a" + tail, 'wb') as fd:
        for chunk in r.iter_content():
            fd.write(chunk)
    root = bs(r.text)
    div = root.find("div", attrs={'class': 'brand_main'})
    lis = div.findAll("li")
    for li in lis:
        img = li.find('img')
        name = img.get("alt")
        src = img.get("src")
        ir = requests.get(src, stream=True)
        with open(pwd + name + tail, 'wb') as fd:
            for chunk in ir.iter_content():
                fd.write(chunk)
        print name, src, "has been downloaded"
    print "finished!"

def get_reddit_news():
    url = 'https://www.reddit.com/r/svenskpolitik/top/?sort=top&t=hour'
    b = mechanize.Browser()
    b.addheaders = [('User-agent', 'SvenskPolitikReaderBot 1.0')]
    response = b.open(url).read()
    soup = bs(str(response))
    entries = soup.findAll("div", {"class": "entry unvoted"})
    for entry in entries:
        title = entry.a.text
        link = entry.a.get('href')
        domain = entry.span.text
        print title
        print link
        print domain
        if domain != '(self.svenskpolitik)':
            databaseConnector.insert_news(title, link)
        else:
            print 'Avoided self post!'

def doThreadComments(soup):
    '''
    Print the poster, date, and stripped message text for each comment row in a thread.
    '''
    commentBlock = soup.findChild(None, {"class": "posts"})
    commentRows = commentBlock.findAll(None, {"class": "postbit postbitim postcontainer old"})
    for i, commentRow in enumerate(commentRows):
        # print commentRow
        userObj = commentRow.findChild(None, {"class": "popupmenu memberaction"})
        poster = userObj.findChild(None, {"class": re.compile('username')})
        poster = cleanMsg(poster)
        date = cleanMsg(commentRow.findChild(None, {"class": "date"}))
        date = date.replace(u'\xa0', ' ')  # normalize non-breaking spaces
        print poster
        print date
        print
        # Brute force: strip all HTML data from the message for now
        msgObj = commentRow.findChild(None, {"class": "postcontent restore"})
        #msg = ''.join(bs(str(msgObj)).findAll(text=True)).strip()
        msg = cleanText(''.join(bs(str(msgObj)).findAll(text=True)).strip())
        print msg.encode('ascii', 'ignore')
        print " =============================="
