def get_featured(self):
    url = 'http://www.lds.org/media-library/video?lang=eng'
    soup = BeautifulSoup(make_request(url), convertEntities=BeautifulSoup.HTML_ENTITIES)
    for i in soup.find('div', {'class': 'feature-box'}).find('ul', {'class': "feature-preview"})('li'):
        fc = i.find('div', {'class': 'feature-control'})
        name = fc.findNext('h3').getText().encode('utf8')
        desc = fc.p.getText().encode('utf8')
        u = fc.findNext('a')['href']
        thumb = "https://www.lds.org" + urllib.quote(i.findNext('img')['src'])
        if 'media-library/video/categories' in u:
            mode = 2
        else:
            mode = 4
        self.add_dir(thumb, {'Title': name, 'Plot': desc}, {'name': name, 'url': u, 'mode': mode}, thumb)
    for i in soup.find('ul', {'class': 'media-list'})('li'):
        name = i.findNext('h4').a.getText().encode('utf8')
        desc = i.findNext('p').getText().encode('utf8')
        u = i.find('a', {'class': 'video-thumb-play'})['href']
        thumb = i.findNext('img')['src']
        try:
            soup2 = BeautifulSoup(make_request(u), convertEntities=BeautifulSoup.HTML_ENTITIES)
            for j in soup2.find('div', {'class': 'galleryMeta'})('p'):
                try:
                    if "for downloads" in j.a.getText():
                        u = j.a['href']
                        break
                except:
                    continue
            else:
                continue
        except:
            print "Couldn't get video link for %s. %s" % (name, traceback.format_exc().splitlines()[-1])
            continue
        if 'media-library/video/categories' in u:
            mode = 2
        else:
            mode = 4
        self.add_dir(thumb, {'Title': name, 'Plot': desc}, {'name': name, 'url': u, 'mode': mode}, thumb)
def getpresentationdetails(sender, **kwargs):
    print "Pre Save!"
    #print sender
    model = kwargs['instance']
    # fetch the presentation url
    try:
        import urllib
        from BeautifulSoup import BeautifulSoup as BS
        html = urllib.urlopen(kwargs['instance'].url).read()
        bs = BS(html)
        # find the media url
        presurl = bs.find('link', rel='media:presentation')
        print "* Presentation: " + presurl['href']
        # and the thumbnail
        thumburl = bs.find('link', rel='image_src')
        print "* Thumbnail: " + thumburl['href']
        # and the author name
        creator = bs.find('meta', property='dc:creator')
        print "* Creator: " + creator['content']
        title = bs.find('meta', property="media:title")
        print "* Content: " + title['content']
    except Exception, e:
        raise e
def fetch_page(link_id):
    link = Link.objects.get(pk=link_id)
    url = link.url
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'}
    req = urllib2.Request(url, None, headers)
    try:
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html)
        link.title = soup.find('title').text
        favicon = soup.find('link', rel='shortcut icon')
        if favicon and favicon['href']:
            link.favicon = urljoin(url, favicon['href'])
        for item in soup.findAll('meta'):
            if item.get('name', '').lower() in ('description', 'og:description') and item.get('content', ''):
                link.description = item.get('content', '')
    except Exception as e:
        link.is_error = 1
        link.error_text = e.reason.__str__()
    link.save()
def _retrieve_product(cls, url):
    browser = mechanize.Browser()
    soup = BeautifulSoup(browser.open(url))
    result = {}
    container = soup.find('div', 'detalle_tienda')
    result['name'] = container.find('h1').string
    category = soup.find('div', 'selectcion_cat').find('div', 'txt').string
    result['category'] = category
    store_data = container.findAll('li')
    result['level'] = store_data[2].string
    result['local'] = store_data[0].string
    phone = store_data[1].string.split(u'Teléfono ')[1]
    if not phone:
        phone = None
    result['phone'] = phone
    result['picture'] = cls.base_url + container.find('img')['src']
    store_url = None
    link_tag = store_data[3].find('a')
    if link_tag.string:
        store_url = link_tag['href']
    result['store_url'] = store_url
    return result, {}
def _parse(self, html):
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, fromEncoding='utf-8')
    try:
        p_tags = soup.find('div', attrs={'itemprop': 'articleBody'}).findAll('p')
    except AttributeError:
        self.real_article = False
        return
    main_body = '\n'.join([p.getText() for p in p_tags])
    self.body = main_body
    self.meta = soup.findAll('meta')
    self.title = soup.find('meta', attrs={'property': 'og:title'}).get('content')
    author = soup.find('p', attrs={'itemprop': 'name'})
    if author:
        self.byline = author.getText()
    else:
        self.byline = ''
    datestr = soup.find('time', attrs={'itemprop': 'datePublished'}).get('datetime')
    new_dt = datestr[:19]
    datet = datetime.strptime(new_dt, '%Y-%m-%dT%H:%M:%S')
    self.date = datet.strftime(DATE_FORMAT)
class GoogleCodeProjectExtractor(object):
    RE_REPO_TYPE = re.compile(r'(svn|hg|git)')
    PAGE_MAP = {
        'project_info': 'http://code.google.com/p/%s/',
        'source_browse': 'http://code.google.com/p/%s/source/browse/',
    }
    LICENSE_MAP = defaultdict(lambda: 'Other/Proprietary License', {
        'Apache License 2.0': 'Apache Software License',
        'Artistic License/GPL': 'Artistic License',
        'Eclipse Public License 1.0': 'Eclipse Public License',
        'GNU GPL v2': 'GNU General Public License (GPL)',
        'GNU GPL v3': 'GNU General Public License (GPL)',
        'GNU Lesser GPL': 'GNU Library or Lesser General Public License (LGPL)',
        'MIT License': 'MIT License',
        'Mozilla Public License 1.1': 'Mozilla Public License 1.1 (MPL 1.1)',
        'New BSD License': 'BSD License',
        'Other Open Source': 'Other/Proprietary License',
    })
    DEFAULT_ICON = 'http://www.gstatic.com/codesite/ph/images/defaultlogo.png'

    def __init__(self, project, page='project_info'):
        gc_project_name = project.get_tool_data('google-code', 'project_name')
        self.url = self.PAGE_MAP[page] % urllib.quote(gc_project_name)
        self.project = project
        self.page = BeautifulSoup(urllib2.urlopen(self.url))

    def get_short_description(self):
        self.project.short_description = self.page.find(itemprop='description').string.strip()

    def get_icon(self):
        icon_url = urljoin(self.url, self.page.find(itemprop='image').attrMap['src'])
        if icon_url == self.DEFAULT_ICON:
            return
        icon_name = urllib.unquote(urlparse(icon_url).path).split('/')[-1]
        fp_ish = urllib2.urlopen(icon_url)
        fp = StringIO(fp_ish.read())
        M.ProjectFile.save_image(
            icon_name, fp,
            fp_ish.info()['content-type'].split(';')[0],  # strip off charset=x extra param
            square=True,
            thumbnail_size=(48, 48),
            thumbnail_meta={'project_id': self.project._id, 'category': 'icon'})

    def get_license(self):
        license = self.page.find(text='Code license').findNext().find('a').string.strip()
        trove = M.TroveCategory.query.get(fullname=self.LICENSE_MAP[license])
        self.project.trove_license.append(trove._id)

    def get_repo_type(self):
        repo_type = self.page.find(id="crumb_root")
        if not repo_type:
            raise Exception("Couldn't detect repo type: no #crumb_root in "
                            "{0}".format(self.url))
        re_match = self.RE_REPO_TYPE.match(repo_type.text.lower())
        if re_match:
            return re_match.group(0)
        else:
            raise Exception("Unknown repo type: {0}".format(repo_type.text))
def _on_page(self, page):
    if not page:
        import ipdb
        ipdb.set_trace()
    soup = BeautifulSoup(page)
    if not soup.find('a', text='Log in'):
        event = soup.find('b', text='Something has happened!')
        if event:
            cell = event.findParent('table').findAll('td')[2]
            text = ''.join([x.text if hasattr(x, 'text') else x
                            for x in cell.childGenerator()])
            self._logger.info("Something has happened: %s", text)
        try:
            self._neopoints = get_np(soup)
        except NoNpInPage:
            pass
        return soup
    self._logger.info('Need to login. Using account %s', self._username)
    data = dict(username=self._username,
                password=self._password,
                destination=soup.find('input', attrs=dict(name='destination'))['value'])
    d = self._browser.post('http://www.neopets.com/login.phtml', data)
    d.addCallback(self._on_login)
    return d
def transcripts(self, organism, gene_id):
    """Retrieve a list of (transcript, protein) ids for the given gene_id.
    """
    txs = []
    ps = []
    valid_gene_starts = ["EN", "FB", "AA", "AG"]
    with self._get_open_handle("Gene", "Summary", organism, gene_id) as in_handle:
        soup = BeautifulSoup(in_handle)
        tx_info = soup.find("table", {"id": "transcripts"})
        if tx_info is None:
            tx_info = soup.find(True, {"id": "transcripts_text"})
        #print tx_info
        tx_links = tx_info.findAll("a", href=re.compile("Transcript/Summary"))
        for tx_link in tx_links:
            if tx_link.string and tx_link.string[:2] in valid_gene_starts:
                txs.append(tx_link.string)
        p_links = tx_info.findAll("a", href=re.compile("Transcript/ProteinSummary"))
        for p_link in p_links:
            if p_link.string:
                ps.append(p_link.string)
    assert len(txs) == len(ps), (organism, gene_id, txs, ps)
    return zip(txs, ps)
def check_login():
    login = __settings__.getSetting("Login")
    password = __settings__.getSetting("Password")
    if len(login) > 0:
        http = GET(httpSiteUrl, httpSiteUrl)
        if http == None:
            return None
        beautifulSoup = BeautifulSoup(http)
        userPanel = beautifulSoup.find('a', {"id": "loginlink"})
        if userPanel == None:
            os.remove(cookiepath)
            loginResponse = GET(httpSiteUrl, httpSiteUrl, {
                'login': '******',
                'login_name': login,
                'login_password': password,
                'submit': 'Вход'
            })
            loginSoup = BeautifulSoup(loginResponse)
            userPanel = loginSoup.find('a', {"id": "loginlink"})
            if userPanel == None:
                showMessage('Login', 'Check login and password', 3000)
            else:
                return userPanel.text.encode('utf-8', 'cp1251')
        else:
            return userPanel.text.encode('utf-8', 'cp1251')
    return None
def urlVoid(s):
    """
    API info: http://blog.urlvoid.com/urlvoid-api-v2-0/
    Restrictions: < 1,000 per day
    * if "-1" is returned it means the domain has not been scanned yet
    """
    print(header("URLvoid"))
    api_key = ""
    if not api_key:
        print "[!] You must configure your URLvoid API key"
    else:
        url = "http://api.urlvoid.com/index/exec/"
        parameters = {"domains": s, "api": api_key, "go": 'Check'}
        data = urllib.urlencode(parameters)
        try:
            page = urllib2.urlopen(url, data)
            soup = BeautifulSoup(page)
            new_date = datetime.fromtimestamp(int(soup.find("details")['last_scan'])).strftime("%b %d %Y")
            print "Last Scan :", new_date
            detect_cnt = soup.find("details")['detected']
            if detect_cnt == "-1":
                print "Not scanned yet"
            else:
                print "Detected :", detect_cnt
                if detect_cnt > "0":
                    print "Detections :", soup.find("details")['lists_detected']
        except Exception, msg:
            print msg
def placeFromMap(listing):
    soup = BeautifulSoup(listing)
    geo_location_div = soup.find('div', {'id': 'map'})
    if geo_location_div:
        latitude = geo_location_div['data-latitude']
        longitude = geo_location_div['data-longitude']
        geolocation = (float(latitude), float(longitude))
    else:
        geolocation = None
    address_div = soup.find('div', {'class': 'mapaddress'})
    if address_div:
        address = address_div.text
    else:
        address = None
    posting_title_div = soup.find('h2', {'class': 'postingtitle'})
    if posting_title_div:
        match = re.search(r'.*\((.+?)\)$', posting_title_div.text.strip())
        if match:
            end_label = match.group(1)
        else:
            end_label = None
    else:
        end_label = None
    return address, geolocation, end_label
def ParsePage(self, id):
    resp = urllib.urlopen(self.main_url % id)
    soup = BeautifulSoup(resp.read(), fromEncoding="euc-kr")
    self.meta.m_id = id
    self.meta.m_title = soup.find('h2').string
    self.meta.m_artist = [soup.find("dt", id="artistName").span.a.string]
    strain = SoupStrainer("div", {"class": "album_info"})
    sect = soup.find(strain)
    self.meta.m_thumb = sect.find("img", {"id": "albumBigThumb"})['src']
    self.meta.m_genres = sect.find("img", alt=u"장르").parent.nextSibling.nextSibling.next.string.strip().split('/')
    self.meta.m_release = sect.find("img", alt=u"발매일").parent.nextSibling.nextSibling.next.string.strip()
    self.meta.m_rating = float(sect.find("span", {"class": "text_point"}).string)
    self.meta.m_review = ''.join(soup.find("div", id="albumDesc").findAll(text=True)).strip()
    self.meta.m_review = self.meta.m_review.replace("&amp;", "&")
    self.meta.m_review = self.meta.m_review.replace("&#39;", "'").replace("–", "-")
    self.meta.m_review = unicode(self.meta.m_review, 'utf-8')
    self.meta.m_tracks = []
    for item in soup.findAll("td", {"class": "num"}):
        pos = int(item.string)
        track = item.findNextSiblings('td')[1].a.string
        self.meta.m_tracks.append((pos, track))
    return self.meta
def ParseSeriesCastPage(self, id):
    resp = urllib.urlopen(self.cast_url % id)
    soup = BeautifulSoup(resp.read(), fromEncoding="utf-8")
    pt = soup.find("h5", text=re.compile(u"^\s*출연\s*$"))
    if pt:
        for item in pt.parent.parent.findAll("dl"):
            name = item.find('img')['alt'].strip()
            role = item.find('span', {"class": "etcs"}).string.strip()
            if role.rfind(u" 역") >= 0:
                role = role[:role.rfind(u" 역")]
            else:
                role = ''
            self.meta.s_actors.append((name, role))
    pt = soup.find("h5", text=re.compile(u"^\s*제작진\s*$"))
    if pt:
        for item in pt.parent.parent.findAll('li'):
            if item.contents[0].string.startswith(u"극본"):
                for person in item.contents[1:]:
                    name = person.string.strip()
                    if name:
                        self.meta.s_writers.append(name)
            elif item.contents[0].string.startswith(u"연출"):
                for person in item.contents[1:]:
                    name = person.string.strip()
                    if name:
                        self.meta.s_directors.append(name)
def makeHTMLQuestion(fn, htmldata):
    soup = BeautifulSoup(htmldata)
    # add JS
    soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')"
    soup.find('head').insert(0, SUBMIT_JS)
    # replace forms
    forms = soup.findAll('form')
    if forms:
        for form in forms:
            if not form.has_key('method'):
                form['method'] = 'POST'
            if not form.has_key('action'):
                if testmode:
                    form['action'] = 'http://workersandbox.mturk.com/mturk/externalSubmit'
                else:
                    form['action'] = 'http://www.mturk.com/mturk/externalSubmit'
            if not form.has_key('onSubmit'):
                form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');"
            inputtag = Tag(soup, 'input')
            inputtag['type'] = 'hidden'
            inputtag['name'] = 'assignmentId'
            inputtag['id'] = 'myAssignmentId'
            inputtag['value'] = ''
            form.insert(0, inputtag)
    mainurl = uploadfile(fn, str(soup))
    for sub in soup.findAll('img'):
        # TODO
        fn = dirname(fn) + '/' + sub['src']
        uploadfile(fn)
    return ExternalQuestion(escape(mainurl), frame_height)
def _parse_cdphotothread_image_partial(cls, html):
    """
    Input: the HTML from the thread page
    ex: http://www.chiefdelphi.com/media/photos/38464
    Returns the url of the image in the thread
    ex: http://www.chiefdelphi.com/media/img/3f5/3f5db241521ae5f2636ff8460f277997_l.jpg
    """
    html = html.decode("utf-8", "replace")
    # parse html for the image url
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    # 2014-07-15: CD doesn't properly escape the photo title, which breaks the find() for the cdmLargePic element below.
    # Fix by removing all instances of the photo title from the HTML.
    photo_title = soup.find('div', {'id': 'cdm_single_photo_title'}).text
    cleaned_soup = BeautifulSoup(html.replace(photo_title, ''),
                                 convertEntities=BeautifulSoup.HTML_ENTITIES)
    element = cleaned_soup.find('a', {'target': 'cdmLargePic'})
    if element is not None:
        partial_url = element['href']
    else:
        return None
    # partial_url looks something like: "/media/img/774/774d98c80dcf656f2431b2e9186f161a_l.jpg"
    # we want "774/774d98c80dcf656f2431b2e9186f161a_l.jpg"
    image_partial = re.match(r'\/media\/img\/(.*)', partial_url)
    if image_partial is not None:
        return image_partial.group(1)
    else:
        return None
def getUrl(self, url, downloadtype=None):
    if not downloadtype:
        downloadtype = self.downloadtype
    self.errors = []
    html = self.downloadHtml(url)
    if not html:
        return False
    soup = BeautifulSoup(html)
    manga = soup.find("object", {u"id": u"showmedia_videoplayer_object"})
    if manga:
        manga = manga.find("embed", {u"type": u"application/x-shockwave-flash"}).get("flashvars").split("=")
        manga_titulo = soup.find(u"span", {u"itemprop": u"title"}).text
        #nopermitido = ["\\","/","?",":","*","\"","<",">","|"]
        #for i in nopermitido: manga_titulo = manga_titulo.replace(i,' ')
        manga_titulo = self.checkStr(manga_titulo)
        self.manga_titulo = manga_titulo
        n = len(manga) - 1
        serie_id = manga[1][:manga[1].find('&chapterNumber')]
        chapterNumber = self.numCap(manga[2][:manga[2].find('&server')])
        if chapterNumber:
            self.chapterNumber = chapterNumber[0]
        sesion_id = manga[n]
        url_serie = "http://api-manga.crunchyroll.com/chapters?series_id=" + serie_id
        if downloadtype == "Chapter":
            return self.Chapter(sesion_id, url_serie, manga_titulo, chapterNumber)
        elif downloadtype == "Volume":
            return self.Volume(soup)
        elif downloadtype == "Complete":
            pass
        else:
            self.addError("Error: Invalid download type.")
def _parse(self, html):
    soup = bs4.BeautifulSoup(html)
    print_link = soup.findAll('a', text='Print')[0].get('href')
    html2 = grab_url(print_link)
    logger.debug('got html 2')
    # Now we have to switch back to bs3. Hilarious.
    # and the labeled encoding is wrong, so force utf-8.
    soup = BeautifulSoup(html2, convertEntities=BeautifulSoup.HTML_ENTITIES, fromEncoding='utf-8')
    self.meta = soup.findAll('meta')
    p_tags = soup.findAll('p')[1:]
    real_p_tags = [p for p in p_tags
                   if not p.findAll(attrs={'class': "twitter-follow-button"})]
    self.title = soup.find('strong').getText()
    entity = soup.find('span', attrs={'class': 'author'})
    children = list(entity.childGenerator())
    try:
        self.byline = 'By ' + children[1].getText()
    except IndexError:
        self.byline = ''
    self.date = children[-1].strip()
    self.body = '\n' + '\n\n'.join([p.getText() for p in real_p_tags])
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    soup = BeautifulSoup(html)
    title = soup.find("h1").text
    content = soup.find('div', {'id': 'mw-content-text'}).find('p').text
    store(title, content)
    return soup.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile('^(\/wiki\/)((?!:).)*$'))
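# A minimal driver for getLinks, sketched under the assumption that urlopen,
# BeautifulSoup, re, and the store() helper used above are already available.
# The random-walk loop and the starting article are illustrative, not part of
# the original snippet.
import random

links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    newArticle = random.choice(links)["href"]
    print newArticle
    links = getLinks(newArticle)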
def login(self, username='', passwd='', app_user_nick=None, target=None, use_taobaoid=False):
    if use_taobaoid:
        systime = SysTime()
        params = {
            'app_key': self.API_KEY,
            'timestamp': systime.get(),
            'sign_method': self.SIGN_METHOD,
        }
        if app_user_nick != None:
            params['app_user_nick'] = app_user_nick
        if target != None:
            params['target'] = target
        src = self.APP_SECRET + ''.join(["%s%s" % (k, v) for k, v in sorted(params.iteritems())]) + self.APP_SECRET
        params['sign'] = md5(src).hexdigest().upper()
        form_data = urllib.urlencode(params)
        rsp = requests.get('%s?%s' % (self.TaobaoID_URL, form_data))
        print rsp.content
    else:
        rsp = requests.get('%s%s' % (self.LOGIN_URL, self.API_KEY))
        soup = BeautifulSoup(rsp.content)
        iframe_src = soup.find('iframe')['src']
        rsp = requests.get(iframe_src)
        print rsp.url
        #s = requests.session()
        login_url = 'https://login.taobao.com/member/login.jhtml'
        soup = BeautifulSoup(rsp.content)
        login_url = soup.find('form')['action']
        #inputs = soup.findAll('input')
        forms = self.extract_form_fields(soup)
        forms['TPL_username'] = username
        forms['TPL_password'] = passwd
        rsp = requests.post(login_url, data=forms)
        print rsp.url
        print rsp.content
def fslink_get_video_list(url, count):
    soup = BeautifulSoup(make_request(url), convertEntities=BeautifulSoup.HTML_ENTITIES)
    # items = soup.findAll('a', {'class': 'title t'})
    # for item in items:
    #     print item
    #     print item.a
    #     print item.nextSibling().img('src')
    #     add_dir(item.text.encode("utf-8").replace(' ', ' '), FSLINK + item['href'], 8, icon)
    items = soup.find("div", {"class": "featured-view"})
    for item in items.findAll("a"):
        try:
            add_dir(item.img["alt"], FSLINK + item["href"], 8, FSLINK + item.img["src"])
        except:
            pass
    items = soup.find("div", {"class": "vm-pagination"})
    for item in items.findAll("a"):
        try:
            if item.string == ">":
                if count < 0:
                    count = count + 1
                    fslink_get_video_list(FSLINK + item["href"], count)
            else:
                add_dir(item.string, FSLINK + item["href"], 7, icon)
        except:
            pass
def play(url=common.args.url):
    swfUrl = 'http://admin.brightcove.com/viewer/us20110809.1526/federatedVideoUI/BrightcovePlayer.swf'
    exp_id = common.args.exp_id
    data = common.getURL(url)
    tree = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    key = tree.find('param', attrs={'name': 'playerKey'})['value']
    content_id = tree.find('param', attrs={'name': '@videoPlayer'})['value']
    #key = re.compile('<param name="playerKey" value="(.+?)" />').findall(data)[0]
    #content_id = re.compile('<param name="@videoPlayer" value="(.+?)" />').findall(data)[0]
    #exp_id = re.compile('<param name="playerID" value="(.+?)" />').findall(data)[0]
    renditions = get_episode_info(key, content_id, url, exp_id)['programmedContent']['videoPlayer']['mediaDTO']['renditions']
    rtmp = ''
    hi_res = 0
    selected_video = None
    for video in renditions:
        if int(video['size']) > hi_res:
            selected_video = video
            hi_res = int(video['size'])
    link = selected_video['defaultURL']
    item = xbmcgui.ListItem(path=link)
    return xbmcplugin.setResolvedUrl(pluginhandle, True, item)
def get(self):
    content = self.request.content
    soup = BeautifulSoup(''.join(content))
    # all text has already been converted to unicode automatically; re-encode with encode(xxx) if needed
    title = soup.html.body.h1
    if not title:
        return
    title = title.text
    subtitle = soup.findAll(attrs={'class': 'f_cy f_s16b'})[0].string
    description = soup.find(attrs={'class': 'f_cy f_s14 pt20'})
    description = description.text if description else ''
    smooth_index = soup.findAll(attrs={'class': 'pt20'})[0]
    smooth_index = smooth_index.text if smooth_index else ''
    information = soup.findAll(attrs={'class': 'pt20'})[1]
    information = information.text if information else ''
    tips = soup.find(attrs={'class': 'f_s14 pt20'})
    tips = tips.text + tips.nextSibling.nextSibling.text if tips else ''
    # pics = soup.findAll('a', href=re.compile(r'pic\d'))
    pics = soup.findAll(attrs={'class': 'pic1'})
    if pics:
        imageList = []
        for pic in pics:
            img = pic.find('img')['src']
            imageList.append(img)
            spider.put(HTTP % img)
        self.page.append((self.request.url, title, subtitle, description,
                          smooth_index, information, tips, imageList))
def convertToSQL(line):
    try:
        soup = BeautifulSoup(line)
        name = soup.contents[0].a.string
        link = soup.find("a")["href"]
        try:
            target = soup.find("a")["target"]
        except:
            target = ''
        try:
            rel = soup.find("a")["rel"]
        except:
            rel = ''
        try:
            title = soup.find("a")["title"]
        except:
            title = ''
        sqlLine = 'INSERT into %s.wp_links (link_url, link_name, link_target, link_description, link_rel)' % database
        sqlLine += """ values ('%s','%s','%s','%s','%s');""" % (link, name, target, title, rel)
        print sqlLine
    except:
        pass
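# A small usage sketch for convertToSQL; the module-level `database` value and
# the sample bookmark line below are assumptions for illustration only.
database = 'wordpress'
convertToSQL('<li><a href="http://example.com/" target="_blank" title="Example site" rel="friend">Example</a></li>')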
def index(url, name):
    if not name == 'All Videos':
        if not re.search('page=', url):
            addPlaylist('Play Featured Videos', url, 4, '')
    req = urllib2.Request(url)
    req.addheaders = [('Referer', 'http://www.nfl.com/'),
                      ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 ( .NET CLR 3.5.30729)')]
    response = urllib2.urlopen(req)
    link = response.read()
    response.close()
    soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
    videos = soup.find('ul', attrs={'id': "video-list-items"})('li')
    for video in videos:
        name = video('h3')[0]('a')[0].string
        link = video('h3')[0]('a')[0]['href'].split('/')[3]
        thumb = video('img')[0]['src'].replace('_video_thumbnail_80_60.jpg', '_video_rhr_280_210.jpg')
        try:
            desc = video('p')[1].string + ' \n ' + video('p')[0].string
        except:
            desc = video('p')[0].string
        duration = video('div')[-1].string.replace('\n', '').replace('\t', '')
        addLink(name, link, thumb, duration, desc, 3)
    try:
        page = soup.find('div', attrs={'id': "video-list-pagination"})('a')[-1]['href']
        if not page == '?page=3':
            addDir('Next Page', url.split('?')[0] + page, 1, next)
        else:
            addDir('Next Page', 'http://www.nfl.com/ajax/videos/v2?batchNum=1&channelId=' + url.split('/')[-1].split('?')[0], 5, next)
    except:
        pass
def folderDescription(self, folderUrl):
    #return 'description here'
    http = self.GET(folderUrl, httpSiteUrl)
    fullSoup = BeautifulSoup(http)
    itemInfo = fullSoup.find('div', 'item-info')
    if None == itemInfo:
        return ''
    plot = fullSoup.find('meta', attrs={'name': 'description'})
    try:
        if plot != None:
            plot = plot['content']
        if plot == None:
            plot = ''
    except:
        plot = ''
    detailsString = ''
    try:
        for pair in itemInfo.findAll('tr'):
            right = ''
            for r in pair.findAll('a'):
                right += r.string + ','
            right = right.rstrip(',')
            detailsString += pair.find('td').string.strip() + " " + right + "\n"
    except:
        detailsString = ''
    description = detailsString + '\n\n' + plot
    return description.encode('utf-8')
def load_pol_pics():
    for pol in Politician.objects.exclude(parlpage='').filter(models.Q(headshot__isnull=True) | models.Q(headshot='')):
        print "#%d: %s" % (pol.id, pol)
        print pol.parlpage
        soup = BeautifulSoup(urllib2.urlopen(pol.parlpage))
        img = soup.find('img', id='MasterPage_MasterPage_BodyContent_PageContent_Content_TombstoneContent_TombstoneContent_ucHeaderMP_imgPhoto')
        if not img:
            img = soup.find('img', id="ctl00_cphContent_imgParliamentarianPicture")
        if not img:
            raise Exception("Didn't work for %s" % pol.parlpage)
        imgurl = img['src']
        if '?' not in imgurl:  # no query string
            imgurl = urllib.quote(imgurl.encode('utf8'))  # but there might be accents!
        imgurl = urlparse.urljoin(pol.parlpage, imgurl)
        try:
            test = urllib2.urlopen(imgurl)
            content = urllib.urlretrieve(imgurl)
        except Exception, e:
            print "ERROR ON %s" % pol
            print e
            print imgurl
            continue
        #filename = urlparse.urlparse(imgurl).path.split('/')[-1]
        pol.headshot.save(str(pol.id) + ".jpg", File(open(content[0])), save=True)
        pol.save()
def scrape_wretmans(url, day=None):
    page = urlopen(url)
    soup = BeautifulSoup(page)
    page.close()
    if day == None:
        day = date.today().weekday()
    # No lunch on Saturday or Sunday
    if day == 5 or day == 6:
        return daily_specials
    # Modify all the strange <span class="SpellE"> and insert a space before the text
    for s in soup.findAll("span", {"class": "SpellE"}):
        soup.find(text=s.text).replaceWith(" " + s.text)
    day = [u"Måndag", u"Tisdag", u"Onsdag", u"Torsdag", u"Fredag"][day]
    anchor = soup.find(lambda tag: tag.name == "p" and re.match(tag.text, day))
    siblings = anchor.findNextSiblings("p", limit=2)
    specials = []
    for i, s in enumerate([s.text for s in siblings]):
        s = re.sub("\n", " ", s)
        if s[0:6] == "&nbsp;":
            specials.append(s[6:])
        else:
            specials.append(s)
    return filter(len, specials)
def parse(self, response):
    resp = response.body
    soup = BeautifulSoup(resp)
    try:
        post_id = int(re.search(r'topicid="(.*?)";', str(soup)).group(1))
    except:
        raise UnknownResponseError
    headcode_span = soup.find("span", {"id": "stockheadercode"})
    stock_id = headcode_span.find("a").string
    content = soup.find('div', {'class': 'stockcodec'}).text
    title = soup.find('div', {'id': 'zwconttbt'}).text
    releaseTimePara = re.search(r'发表于 (.*?) (.*?) ', str(soup.find('div', {'class': 'zwfbtime'})))
    part1 = releaseTimePara.group(1).decode('utf-8')
    part2 = releaseTimePara.group(2).decode('utf-8')
    releaseTime = part1 + ' ' + part2
    lastReplyTime = None
    zwlitxb_divs = soup.findAll('div', {'class': 'zwlitime'})
    if len(zwlitxb_divs):
        lastReplyTime = re.search(r'发表于 (.*?)<', str(zwlitxb_divs[0])).group(1).decode('utf-8').replace('&nbsp;', ' ')
    item_dict = {'post_id': post_id, 'content': content, 'releaseTime': releaseTime,
                 'lastReplyTime': lastReplyTime, 'stock_id': stock_id, 'title': title}
    item = GubaPostDetailItem()
    for key in GubaPostDetailItem.RESP_ITER_KEYS:
        item[key] = item_dict[key]
    return item
def _product_urls_and_types(cls, product_types):
    product_links = []
    if 'Store' not in product_types:
        return []
    browser = mechanize.Browser()
    main_urls = [
        cls.base_url + '/tiendas/',
        cls.base_url + '/mirador-del-alto/',
    ]
    for main_url in main_urls:
        soup = BeautifulSoup(browser.open(main_url))
        categories = soup.find('ul', 'optciones_cat').findAll('li')
        for category_element in categories:
            category_url = cls.base_url + category_element.find('a')['href']
            category_soup = BeautifulSoup(browser.open(category_url))
            store_options = category_soup.find('ul', 'optciones_sto').findAll('li')
            for option in store_options:
                url = cls.base_url + option.find('a')['href']
                product_links.append([url, 'Store'])
    return product_links
def scrape_councillor(url, record):
    record["URL"] = "http://www.winnipeg.ca/council/" + url
    soup = BeautifulSoup(scraperwiki.scrape(record["URL"]))
    # strip all HTML comments from the page.
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    tables = soup.find("div", {"id": "content"}).findAll("table")
    img = soup.find("img", {"class": "bio_pic"})
    if img:
        record["Image"] = "http://www.winnipeg.ca" + img["src"]
    name = soup.find("span", {"class": "bg90B"})
    record["Name"] = name.text.replace('Councillor', '').strip()
    # table = soup.find(text="Ward Information").findParent('table').find("table", {"width": "100%"})
    table = tables[0]
    key = ''
    value = ''
    # Could be improved to add spaces within the addresses.
    for row in table.findAll("tr"):
        cols = row.findAll("td")
        k = cols[0].text.strip().replace(':', '')
        if len(k) > 0:
            add_record_value(record, key, value)
            key = k
            value = ''
        if len(cols) > 1:
            value += cols[1].text.strip() + '\n'
    add_record_value(record, key, value)
def mobileUA(content):
    soup = BeautifulSoup(content, convertEntities=BeautifulSoup.HTML_ENTITIES)
    res = soup.find('html')
    res = res.get('class', '') if res else ''
    return True if 'a-mobile' in res or 'a-tablet' in res else False
import mechanize
import re
import scraperwiki
from BeautifulSoup import BeautifulSoup

br = mechanize.Browser()
br.open("http://www.business.detini.gov.uk/iva_register/IVASearch.aspx")
br.select_form(name="aspnetForm")
# br['ctl00$ContentPlaceHolder1$Surname'] = "McMullan"  # comment to search all
response = br.submit()
#scraperwiki.sqlite.save('data_columns', ['Surname', 'Forename', 'DOB', 'Address', 'Postcode', 'latlng'])
soup = BeautifulSoup(response)
tds = soup.find("table", {"id": "ctl00_ContentPlaceHolder1_GridView1"})
rows = tds.findAll("tr")
for row in rows:
    record = {}
    table_cells = row.findAll("td")
    if table_cells:
        surname = re.sub("(\(.*)|( nee.*)", '',
                         (re.sub("(\s[a|A]ka.*)|(\sformerly.*)", '', table_cells[0].text)))
        surname = re.sub("\s", '', surname)
        record['Surname'] = surname
        record['Forename'] = table_cells[1].text
        dob = re.sub('&nbsp;', "Unknown", table_cells[2].text)
        record['DOB'] = dob
        address = table_cells[3].text
import time
import scraperwiki
import re
from BeautifulSoup import BeautifulSoup

page = 4300
while page <= 9500:
    url = 'http://www.w4mp.org/html/personnel/jobs/disp_job_text.asp?ref=%s' % page
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    table = soup.find('table').find('table')

    def soup_strip_html(st):
        return ''.join([e.strip() for e in st.recursiveChildGenerator()
                        if isinstance(e, unicode)])

    def strip_html(st):
        tags = re.compile(r'<.*?>')
        return tags.sub('', st)

    if table:
        data = {}
        data['url'] = url
        row_count = 0
        for row in table.findAll('tr'):
            cells = row.findAll('td')
            if row_count == 1:
                key = 'title'
                data[key] = soup_strip_html(cells[0])
            else:
                if cells[0].string != '&nbsp;' and cells[1].string != '&nbsp;':
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
import scraperwiki
from scraperwiki import sqlite

mech = Browser()
url = 'http://www.gpupdate.net/en/standings/192/2013-moto3-standings/'
page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html)
resContainer = soup.find("div", {"id": "middle_container"})
rownumber = 0
table = soup.find("table")
for row in table.findAll('tr')[1:40]:
    col = row.findAll('td')
    pos = int(col[0].string.replace(".", ""))
    driver = col[1].a.string
    tempTD = col[1]
    team = tempTD.findAll('span')
    team = team[1].string
    points = col[2].string
    country = tempTD.findAll('img')
    country = country[0]['alt'].upper()
re_rating = re.compile("reviewer-rating")

""" Url settings for scraping """
product_id = 10002735
review_max = 10

""" Scraping """
# generating first review's url
base_url = "http://www.cosme.net/product/product_id/%d/reviews" % product_id
soup = BeautifulSoup(scraperwiki.scrape(base_url).decode('sjis', "ignore"))
div = soup.find(attrs={"class": "review-sec"})
a = soup.find('a', attrs={"class": 'cmn-viewmore'})
review_url = str(a.attrMap['href'])

# getting reviews iteratively
review_count = 1
while 1:
    # scraping review's data
    print "getting review: %d, url: %s" % (review_count, review_url)
    review_page_soup = BeautifulSoup(
        scraperwiki.scrape(review_url).decode('sjis', "ignore"))
    div = review_page_soup.find(attrs={"class": "review-sec"})
    data = {}
    data['product_id'] = product_id

        " value=%s, labels=%s" % (str(item), [label.text for label in item.get_labels()]))
print "\n".join(r)
print br.title()
br.select_form(name='provider_search_1')
br.form['searchType'] = ['43']  # 43 = Ysgolion cynradd (primary schools)
# value=56, labels=['Ysgolion uwchradd'] (secondary schools)
# value=57, labels=['Ysgolion arbennig'] (special schools)
# value=25, labels=['Meithrinfeydd a gynhelir'] (maintained nurseries)
br.response
base_url = 'http://www.estyn.gov.uk/cymraeg/gwybodaeth-am-arolygiadau/adroddiadau-arolygu/'
print br.submit()
soup = BeautifulSoup(br.response().read())
tud_nesa = soup.find('a', {"class": "next"})
next_link = tud_nesa['href']
next_url = base_url + next_link
#atags = soup.findAll('a')
#print atags
#for atag_inst in atags:
#    atag = atag_inst.find(text=re.compile("Next"))
#    if atag:
#        next_link = atag_inst['href']
print next_link
# if next_link:
#     next_url = base_url + next_link['href']
print next_url
# scrape_and_look_for_next_link(next_url)
#soup = BeautifulSoup(br.response().read())
import requests
from BeautifulSoup import BeautifulSoup
import shutil
import os, sys, errno
import urlparse
from simplejson import loads, dumps

print "starting..."
url = 'http://archillect.com'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
container = soup.find('div', attrs={'id': 'container'})

try:
    state = loads(
        open(os.path.join(os.path.dirname(__file__), 'scrape.state'), 'r').read())
except IOError:
    state = {}

if 'last_image' not in state:
    state['last_image'] = ''

# create the output images folder if it doesn't exist
def ensure_dir(directory):
    if not os.path.exists(directory):
import scraperwiki
from BeautifulSoup import BeautifulSoup
import re

# retrieve a page
starting_url = 'http://finance.yahoo.com/q/ks?s=GE+Key+Statistics'
html = scraperwiki.scrape(starting_url)
soup = BeautifulSoup(html)
#mytable = soup.findAll(id="yfncsumtab")
#mysubtable = mytable.findAll('table')
#print mysubtable

# Find the phrase "Forward P/E" and return the value in the next cell
ForwardPEValue = soup.find(text=re.compile("Forward P/E")).findNext('td').text
# or alternatively:
#MarketCap = soup.find(text=re.compile("Market Cap"))
#MarketCapTag = MarketCap.findNext('td').text

record = {soup.find(text=re.compile("Forward P/E")): ForwardPEValue}
scraperwiki.datastore.save([soup.find(text=re.compile("Forward P/E"))], record)

#mytable = soup('table', limit=10)[9]  # Open the 9th table on the page
#tds = mytable.findAll('td')
#for td in tds:
#    print td
#print mytable.prettify()
#print mytable('tr', limit=3)[2].prettify()
def process(folder):
    indexfile = open(os.path.join(folder, 'index.html'), 'rb')
    try:
        soup = BS(indexfile.read())
    finally:
        indexfile.close()
    styles = [x['href'] for x in soup.findAll('link')]
    soup.find('head').contents = BS(head(styles))
    try:
        soup.find('h1').contents = BS('{{=response.title or request.application}}')
        soup.find('h2').contents = BS("{{=response.subtitle or '=response.subtitle'}}")
    except:
        pass
    for match in (soup.find('div', id='menu'),
                  soup.find('div', {'class': 'menu'}),
                  soup.find('div', id='nav'),
                  soup.find('div', {'class': 'nav'})):
        if match:
            match.contents = BS('{{=MENU(response.menu)}}')
            break
    done = False
    for match in (soup.find('div', id='content'),
                  soup.find('div', {'class': 'content'}),
                  soup.find('div', id='main'),
                  soup.find('div', {'class': 'main'})):
        if match:
            match.contents = BS(content())
            done = True
            break
    if done:
        page = soup.prettify()
        page = re.compile("\s*\{\{=response\.flash or ''\}\}\s*", re.MULTILINE)\
            .sub("{{=response.flash or ''}}", page)
        print page
    else:
        raise Exception("Unable to convert")
filename = asciichars(oddchars(title)) + ".ogg"  # upload filename
# Quick check of filename in use - these should be unique
if nameused(filename):
    print Fore.RED + 'Filename found', Fore.YELLOW + "http://commons.wikimedia.org/wiki/File:" + re.sub(
        "%20", "_", urllib.quote(filename)), Fore.WHITE
    continue
localfile = workingdir + ref + ".mp3"  # source mp3 file
localenc = workingdir + ref
print Fore.GREEN + filename + Fore.WHITE
source = r['file']
artist = r['rec']
gallery = r['url']
url = urltry(gallery)
html = htmltry(url, gallery)
soup = BeautifulSoup(html)
rd = str(soup.find('section', {'id': 'recording-data'}).find('tbody'))
date = rd.split(">Date<")[1].split('<td>')[1].split('<')[0]
dtime = ""
if re.search(">Time<", rd):
    dtime = rd.split(">Time<")[1].split('<td>')[1].split('<')[0]
if len(dtime) > 2:
    date += " " + dtime
elevation = ''
if re.search('>Elevation<', rd):
    elevation = rd.split(">Elevation<")[1].split('<td>')[1].split('<')[0]
background = ''
if re.search('>Background<', rd):
    background = rd.split(">Background")[1].split('<td')[1].split('>')[1].split('<')[0]
    if background == "none":
        background = ''
def download_url(filename):
    """"""
    words = {}
    request = httplib2.Http()
    if not config.url_pattern:
        raise config.ConfigException
    url = config.url_pattern % (filename.replace(' ', ''))
    try:
        response, content = request.request(url)
    except exception.WebException, e:
        raise
    if response.status == 200:
        try:
            soup = BeautifulSoup(content)
            song_list = soup.find(monkey="song-list")
            if song_list:
                html_tags = song_list.findAll("span", {"class": "song-title"})
                html_tags.extend(song_list.findAll("span", {"class": "singer"}))
                words = [tag.text for tag in html_tags]
        except Exception, e:
            raise
    return words


def parse(filename):
    """"""
    wordcount = {}
    try:
def main():
    # (make, result pages, rows per page) for every PowerSearch results page to fetch
    make_pages = [
        ('Buick', [1], 200), ('Acura', [1], 200), ('Chrysler', [1, 2], 200),
        ('Dodge', [1, 2, 3], 200), ('GMC', [1, 2, 3], 200), ('Jeep', [1, 2], 200),
        ('Jaguar', [1, 2], 200), ('Land+Rover', [1], 200), ('Lexus', [1], 200),
        ('Lincoln', [1], 200), ('Mazda', [1, 2], 200), ('Mercury', [1], 200),
        ('Saab', [1], 200), ('Subaru', [1], 200), ('Suzuki', [1], 200),
        ('BMW', [1, 2, 3, 4, 5, 6], 200), ('Cadillac', [1, 2], 200),
        ('Chevrolet', [1, 2, 4, 5, 6, 7], 200), ('Ford', [1, 2, 3, 4, 5], 200),
        ('Honda', [1, 2], 200), ('Hyundai', [1, 2, 3], 200), ('Infiniti', [1, 2], 200),
        ('Kia', [1, 2], 200), ('Mercedes-Benz', [1, 2, 3, 4, 5], 200),
        ('MINI', [1, 2], 200), ('Mitsubishi', [1, 2], 100),
        ('Nissan', [1, 2, 3, 4], 200), ('Scion', [1], 200), ('Smart', [1], 200),
        ('Toyota', [1, 2, 3, 4], 200), ('Volvo', [1, 2], 200),
    ]
    url_template = ('http://www.fueleconomy.gov/feg/PowerSearch.do?action=noform&path=4'
                    '&year1=2005&year2=2017&make=%s&srchtyp=newMake&pageno=%d'
                    '&sortBy=Comb&tabView=0&rowLimit=%d')
    urls = [url_template % (make, page, row_limit)
            for make, pages, row_limit in make_pages
            for page in pages]
    tablehead = 'CarModel,ImgUrl,MPG,UserEco\n'  # EngineDisplacement,Transmission,FuelType,
    while len(urls) > 0:
        try:
            htmltext = ''
            htmltext = urllib.urlopen(urls[0]).read()
        except:
            print urls[0]
        soup = BeautifulSoup(htmltext)
        urls.pop(0)
        table = soup.find('table', attrs={'class': 'cars display responsive stickyHeader'})
        for table_body in table.findAll('tbody'):
            rowNum = 0
            if table_body:
                rows = table_body.findAll('tr')
                for row in rows:
                    if rowNum == 0:
                        model = row.find('td').find('a', href=True)
                        #detail = row.find('span', attrs={'class': 'config'})
                        if model:
                            tablehead = tablehead + ' ' + model.text  # + ' ' + detail.text
                    if row.find('td', attrs={'class': 'mpg-epa'}):
                        imgurl = row.find('td', attrs={'class': 'vphoto'}).find(
                            'img', attrs={'class': 'img-thumbnail img-responsive veh-photo'})
                        fuelecoTable = ''
                        try:
                            fuelecoTable = row.find('td', attrs={'class': 'mpg-epa'}).find(
                                'div', attrs={'class': 'panel panel-default mpg-border'}).find(
                                'div', attrs={'class': 'panel-body'}).find(
                                'table', attrs={'class': 'results'}).findAll('tbody')
                            fueleco = ''
                            for fuelecorow in fuelecoTable:
                                fueleco = fuelecorow.find('td', attrs={'class': 'mpg-comb'})
                                break
                        except:
                            pass
                        finally:
                            if fuelecoTable != '':
                                usereco = row.find('td', attrs={'class': 'mpg-user'})
                                tablehead = tablehead + ',' + str(imgurl['src'])
                                tablehead = tablehead + ',' + str(fueleco.text)
                                tablehead = tablehead + ',' + str(usereco.text)
                                tablehead = tablehead + '\n'
                    rowNum += 1
        #tablehead = tablehead + '\n'
        tablehead = tablehead + '\n'
    f = open('Output.csv', 'w')
    f.write(str(tablehead))
    print tablehead
def get_answers(self, question):
    r = self.session.get('http://%s/questions/%s' % (self.site, question))
    q = BeautifulSoup(r.text).find(attrs={'class': 'question'})
    a = BeautifulSoup(r.text).find(attrs={'id': 'answers'})
    qd = {}
    ad = []
    qd['title'] = BeautifulSoup(r.text).find(attrs={'id': 'question-header'}).find('h1').text
    qd['votes'] = q.find(attrs={'class': 'vote-count-post '}).text
    qd['favos'] = q.find(attrs={'class': 'favoritecount'}).text
    qd['text'] = html2md(str(BeautifulSoup(r.text).find(attrs={'class': 'post-text'}).extract()))
    qd['tags'] = [t.text for t in q.findAll(attrs={'class': 'post-tag'})]
    qd['users'] = []
    u = q.find(attrs={'class': 'post-signature owner'})
    for u in u.findAll(attrs={'class': 'user-info '}):
        avatar = u.find(attrs={'class': 'user-gravatar32'}).find('img')
        username = u.find(attrs={'class': 'user-details'}).find('a')
        reputation = u.find(attrs={'class': 'reputation-score'})
        qd['users'].append({
            'owner': True,
            'last_edit': u.find(attrs={'class': 'user-action-time'}).find('span')['title'],
            'avatar': avatar['src'] if avatar else "",
            'username': username.text if username else "",
            'reputation': reputation.text if reputation else ""
        })
    for u in q.findAll(attrs={'class': 'post-signature'}):
        for u in u.findAll(attrs={'class': 'user-info '}):
            avatar = u.find(attrs={'class': 'user-gravatar32'}).find('img')
            username = u.find(attrs={'class': 'user-details'}).find('a')
            reputation = u.find(attrs={'class': 'reputation-score'})
            qd['users'].append({
                'last_edit': u.find(attrs={'class': 'user-action-time'}).find('span')['title'],
                'avatar': avatar['src'] if avatar else "",
                'username': username.text if username else "",
                'reputation': reputation.text if reputation else ""
            })
    qd['comments'] = []
    for c in q.findAll(attrs={'class': 'comment'}):
        qd['comments'].append({
            'id': c['id'].split('-')[-1],
            'score': c.find(attrs={'class': 'comment-score'}).text,
            'text': c.find(attrs={'class': 'comment-copy'}).text,
            'user': c.find(attrs={'class': 'comment-user'}).text,
            'last_edit': c.find(attrs={'class': 'comment-date'}).find('span')['title'],
        })
    for aa in a.findAll(attrs={'class': re.compile(r'^answer( accepted-answer)?$')}):
        aad = {
            'id': aa['id'].split('-')[-1],
            'accepted': 'accepted-answer' in aa['class'],
            'votes': aa.find(attrs={'class': 'vote-count-post '}).text,
            'text': html2md(str(aa.find(attrs={'class': 'post-text'}))),
            'users': [],
            'comments': []
        }
        for u in aa.findAll(attrs={'class': 'post-signature'}):
            avatar = u.find(attrs={'class': 'user-gravatar32'}).find('img')
            username = u.find(attrs={'class': 'user-details'}).find('a')
            reputation = u.find(attrs={'class': 'reputation-score'})
            aad['users'].append({
                'last_edit': u.find(attrs={'class': 'user-action-time'}).find('span')['title'],
                'avatar': avatar['src'] if avatar else "",
                'username': username.text if username else "",
                'reputation': reputation.text if reputation else ""
            })
        for c in aa.findAll(attrs={'class': 'comment'}):
            user = c.find(attrs={'class': 'comment-user'})
            aad['comments'].append({
                'id': c['id'].split('-')[-1],
                'score': c.find(attrs={'class': 'comment-score'}).text,
                'text': c.find(attrs={'class': 'comment-copy'}).text,
                'user': user.text if user else "",
                'last_edit': c.find(attrs={'class': 'comment-date'}).find('span')['title'],
            })
        ad.append(aad)
    return dict(question=qd, answers=ad)
###############################################################################
# Dun Laoghaire Harbour - CURRENT HARBOUR WEATHER
###############################################################################
import scraperwiki
import mechanize
from BeautifulSoup import BeautifulSoup

# retrieve a page
starting_url = 'http://www.dlharbour.ie/weather/index.php'
html = scraperwiki.scrape(starting_url)
#print html
soup = BeautifulSoup(html)
wdata = soup.find('div', {'class': 'wdata'})
print wdata

record = {}
lis = wdata.findAll('li')
for li in lis:
    val = li.text.split(':', 1)
    if len(val) > 1:
        record[val[0]] = val[1]
print record

# save records to the datastore
scraperwiki.sqlite.save(['Date', 'Time'], record)
def _get_content(self, div_id):
    soup = BeautifulSoup("".join(self.htmlsourse))
    self.data = str(soup.find("div", {"id": div_id}))
def poll_for_polyphen2_results(sid):
    """
    Polls PolyPhen2's GGI web interface for updates on the progress of the job.
    Once the job has completed, the full result file is returned.
    """
    curr_step = -1
    max_tries = 10
    tries = 0
    wait_msg = "Waiting for PolyPhen2 results => %s"
    done_msg = " => Done.\n"
    while True:
        params = urllib.urlencode({
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'manage',
            '_ggi_target_manage': 'Refresh',
            'sid': sid
        })
        doc = None
        while doc is None:
            try:
                response = urllib2.urlopen(pph2_url, params)
                doc = response.read()
            except (socket.timeout, IOError):
                pass
        soup = BeautifulSoup(doc)
        status_td = soup.find('td', text=re.compile(r'^Batch \d+:'))
        if status_td is None:
            # We might be done; make sure this page is not an error page
            if soup.find('b', text=re.compile(r'^Service Name:')):
                break
            else:
                tries += 1
                if tries >= max_tries:
                    raise RemoteException(
                        'PolyPhen won\'t let us check the status right now.')
                spin(15)
                continue
        pos_td = status_td.parent.parent.findAll('td')[1]
        try:
            pos = int(pos_td.string)
        except ValueError:
            pos = 0
        shortened = re.sub(r'^Batch \d+:\s+', '', str(status_td))
        this_step = steps.index(shortened)
        if curr_step != this_step:
            if curr_step != -1:
                write_status((wait_msg + done_msg) % steps[curr_step], True)
            curr_step += 1
            while curr_step < this_step:
                # Write out steps that were completed between refreshes.
                write_status((wait_msg + done_msg) % steps[curr_step])
                curr_step += 1
            maxpos = pos
        write_status(wait_msg % shortened, maxpos - pos, maxpos)
        spin(15)
    if curr_step != -1:
        write_status((wait_msg + done_msg) % steps[curr_step], True)
        curr_step += 1
    while curr_step < len(steps):
        # Write out steps that were completed before the last refresh.
        write_status((wait_msg + done_msg) % steps[curr_step])
        curr_step += 1
    result_url = pph2_result_url % sid
    while True:
        error = False
        try:
            write_status("Waiting for PolyPhen2 results => Waiting for download", True)
            response = urllib2.urlopen(result_url)
            result = response.read()
            if result:
                break
        except (socket.timeout, IOError):
            spin(15)
        if error:
            raise RemoteException(error.split("\n")[0])
    write_status(True)
    return result
def getDiscurso(url):
    html = unicode(scraperwiki.scrape(url), 'utf-8', 'ignore')
    soup = BeautifulSoup(html, fromEncoding='utf-8')
    soup = soup.find('div', {'id': 'content'})
    soup = soup.find('p', {'align': 'justify'})
    return soup.renderContents()
def main(bot, args):
    '''Reply to a listener. Parameters: <user_id> <message>
    If an exclamation mark is given as user_id, the message is shown as an
    announcement.
    ? user_id -- blacklist user_id; their messages will stop reaching the DJ.
    ?? -- show the blacklist.
    ?! -- clear the blacklist.'''
    syl = {
        '0': 'be', '1': 'sa', '2': 'ko', '3': 'pa', '4': 're',
        '5': 'du', '6': 'ma', '7': 'ne', '8': 'wa', '9': 'si',
        'a': 'to', 'b': 'za', 'c': 'mi', 'd': 'ka', 'e': 'ga', 'f': 'no'
    }
    salt = bot.settings["ans_salt"]
    message_limit = 250
    userpost = ""
    if len(args) == 1 and args[0] != "??" and args[0] != "?!" or not len(args):
        return
    blacklisting = False
    if args[0] != "!":
        if args[0] == "??":
            return _("blacklist:\n%s") % "\n".join(bot.blacklist)
        if args[0] == "?!":
            bot.blacklist = []
            return _("blacklist cleared.")
        if args[0] == "?":
            blacklisting = True
            del args[0]
        if len(args[0]) != 12:
            return _("incorrect name entered, should be 12 symbols.")
        check = md5()
        check.update(args[0][:8].encode('utf-8') + salt)
        if check.hexdigest()[:4] != args[0][8:12]:
            return _("incorrect name entered (checksum invalid).")
        if blacklisting:
            bot.blacklist.append(args[0])
            return _("%s was added to blacklist.") % args[0]
        to = ">>" + args[0]
        if args[0] in bot.usersposts:
            userpost = "<span class=\"userpost\">> " + escape(bot.usersposts[args[0]]) + "</span><br/>"
    else:
        to = "!"
    message = " ".join(args[1:])
    if len(message) > message_limit:
        return _("too long answer, should be less than %d symbols, you entered %d symbols.") % (message_limit, len(message))
    soup = BeautifulSoup(open(bot.settings["ans_file"], "r"))
    posts = soup.findAll('p')
    new_post = Tag(soup, 'p')
    user_id = Tag(soup, 'span', [('id', 'user_id')])
    if to != "!":
        user_id.insert(0, escape(to))
    else:
        user_id.insert(0, "<b>>>ОБЪЯВЛЕНИЕ<<</b>")
    new_post.insert(0, '[' + datetime.datetime.strftime(datetime.datetime.now(), "%H:%M:%S") + ']')
    new_post.insert(1, user_id)
    # [url] becomes a plain link, {url} becomes an inline image link.
    message = re.sub(
        r'\[([^]]*)\]',
        lambda x: '<a href="' + x.group(1).replace("&amp;", "&") + '" target="_blank">' + x.group(1) + '</a>',
        escape(message))
    message = re.sub(
        r'\{([^}]*)\}',
        lambda x: '<a href="' + x.group(1).replace("&amp;", "&") +
        '" target="_blank"><img style="max-width: 200px; max-height: 200px;display: inline;" src="' +
        x.group(1).replace("&amp;", "&") + '"/></a>',
        message)
    new_post.insert(2, userpost + message)
    if len(posts) > 0:
        posts[0].parent.insert(2, new_post)
    else:
        soup.find('h1').parent.insert(1, new_post)
    if len(posts) > 9:
        posts[len(posts) - 1].extract()
    f = open(bot.settings["ans_file"], "w")
    f.write(soup.prettify())
    f.close()
    return _("sent.")
                '[Errors when replying to ads?]')[2].partition(
                    'START CLTAGS')[0].partition(
                        '<!-- imgList = new Array')[0]
        # TODO get address

        # Finally, throw the listing into data[]
        data.append(listing)

        # Deal with CL's occasional timeouts
        if i % 10 == 0:
            print "sleeping for 10 seconds..."
            sleep(10)

    # Retrieve the URL for the next page
    nexturl = soup.find(text=re.compile("next 100 postings"))

    # Set next crawl URL
    if nexturl is not None:
        crawlurl = rooturl + nexturl.parent['href']
    else:
        crawlurl = ""

print data

# Write data to database
scraperwiki.sqlite.execute("""
DROP TABLE IF EXISTS `cl_sc_commercial_space`
""")
scraperwiki.sqlite.execute("""
print "not found" continue a_url = u_base + href time.sleep(random.random()) result = requests.get(a_url) if result.status_code != 200: print author, response.reason break soup2 = BS(result.content) data = [] if not soup2: print "no content found" continue table = soup2.find('table', attrs={'id': 'gsc_rsb_st'}) if not table: print "content not found" continue table_body = table.find('tbody') rows = table_body.findAll('tr') for row in rows: cols = row.findAll('td') cols = [ele.text.strip() for ele in cols] data.append([ele for ele in cols if ele]) write_row += "{},{},{},{},".format(data[1][1], data[1][2], data[2][1], data[2][2]) domains = [] for text in soup2.findAll(attrs={ 'class': 'gsc_prf_il',
def getEventDetails(self, eventID):
    """
    Return event details for a given event ID from sherdog.com's fightfinder.

    Arguments:
    eventID -- A String containing the event's numeric event ID from sherdog.com

    Returns:
    eventDetails -- A dictionary containing the event's details as scraped from sherdog.com.

    eventDetails keys:
    ID -- Event's ID
    title -- Event's full title
    promotion -- Promotion which ran the event
    date -- Date of event (YYYY-MM-DD)
    venue -- Event's venue
    city -- City in which event took place
    fights -- A list containing dictionaries (fightDetails[]) with the details of each fight on the event

    fightDetails keys:
    ID -- Fight's ID
    fighter1 -- Sherdog ID for the first fighter
    fighter2 -- Sherdog ID for the second fighter
    winner -- Sherdog ID for the winning fighter
    result -- Method of victory/Type of decision
    referee -- Referee that presided over the fight
    round -- Round in which fight ended
    time -- Time at which final round ended
    """
    # initialise empty dict to store event details
    eventDetails = {}
    # store event ID in dict
    eventDetails['ID'] = eventID
    # generate event url
    url = self.__eventURL__ % eventID
    # retrieve html and initialise beautifulsoup object for parsing
    soup = BeautifulSoup(self.getHtml(url))
    pageTitle = soup.html.head.title.string
    pageTitleArr = pageTitle.split(' - ', 1)
    # find and store event title in dict
    eventDetails['title'] = pageTitle
    # find and store promotion name in dict
    eventDetails['promotion'] = pageTitleArr[0]
    # find and store event date in dict
    tempDate = soup.find("div", {"class": "authors_info"}).find("span", {"class": "date"}).string
    eventDetails['date'] = datetime.datetime.strptime(tempDate, '%b %d, %Y')
    eventTemp = ''
    try:
        # find and store venue in dict
        eventTemp = soup.find("span", {"class": "author"}).findAll(text=True)[0].split("\r\n")
        eventDetails['venue'] = eventTemp[0].lstrip().rstrip(",")
    except:
        # store blank string if no venue listed
        eventDetails['venue'] = ''
    try:
        # find and store city in dict
        eventDetails['city'] = eventTemp[1].lstrip().rstrip()
    except:
        # store blank string if no city listed
        eventDetails['city'] = ''
    # find list of fights for event
    table = soup.find("div", {"class": "module_fight_card"})
    # initialise empty list to store fightDetails dicts
    eventDetails['fights'] = []
    fightDetails = {}
    fights = []
    # the main event is rendered separately from the rest of the card
    fightDetails['fighter1'] = soup.find("div", {"class": "fighter left_side"}).a['href'].rsplit("-", 1)[1]
    fightDetails['fighter2'] = soup.find("div", {"class": "fighter right_side"}).a['href'].rsplit("-", 1)[1]
    leftResult = soup.find("div", {"class": "fighter left_side"}).find("span", {"class": "final_result win"})
    rightResult = soup.find("div", {"class": "fighter right_side"}).find("span", {"class": "final_result win"})
    if leftResult is not None and leftResult.string == 'win':
        fightDetails['winner'] = fightDetails["fighter1"]
    if rightResult is not None and rightResult.string == 'win':
        fightDetails['winner'] = fightDetails["fighter2"]
    tempCells = soup.find("table", {"class": "resume"}).findAll("td")
    fightDetails['ID'] = int(tempCells[0].findAll(text=True)[1].strip())
    fightDetails['result'] = tempCells[1].findAll(text=True)[1].strip()
    fightDetails['referee'] = tempCells[2].findAll(text=True)[1].strip()
    fightDetails['round'] = tempCells[3].findAll(text=True)[1].strip()
    fightDetails['time'] = tempCells[4].findAll(text=True)[1].strip()
    fights.append(fightDetails)
    # find all rows in the fights table
    rows = soup.find("div", {"class": "content table"}).findAll("tr")
    # set rowcount to 0
    rowcount = 0
    # loop through all rows in fights table
    for row in rows:
        # ignore first row in table
        if not rowcount == 0:
            # find all columns in table
            cols = row.findAll('td')
            # initialise empty dict to store fight details
            fightDetails = {}
            # find and store fight ID
            fightDetails['ID'] = int(cols[0].string)
            # find and store ID for fighter1
            fightDetails['fighter1'] = cols[1].a['href'].rsplit('-', 1)[1]
            # find and store ID for fighter2
            fightDetails['fighter2'] = cols[5].a['href'].rsplit('-', 1)[1]
            # check that fight was not a draw
            win = cols[1].find("span").find(text=True)
            if win == 'win':
                # find and store winner ID
                fightDetails['winner'] = fightDetails['fighter1']
            else:
                # store blank string if no winner
                fightDetails['winner'] = ''
            # find and store result
            fightDetails['result'] = cols[6].find(text=True).string
            # find and store referee
            fightDetails['referee'] = cols[6].find("span").string
            # find and store round in which fight ended
            fightDetails['round'] = cols[7].string
            # find and store end time of fight
            fightDetails['time'] = cols[8].string
            # add fightDetails dict to fights list
            fights.append(fightDetails)
        # increase rowcount by 1
        rowcount = rowcount + 1
    # sort fights by ID and store them in the event dict
    sort_on = "ID"
    sortFights = [(dict_[sort_on], dict_) for dict_ in fights]
    sortFights.sort()
    eventDetails['fights'] = [dict_ for (key, dict_) in sortFights]
    # return the scraped details
    return eventDetails
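# The decorate-sort-undecorate idiom at the end of getEventDetails() could be
# written more directly with sorted() and a key function; an equivalent sketch
# (not part of the original class):
# eventDetails['fights'] = sorted(fights, key=lambda fight: fight['ID'])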
for s in symbols:
    # First form URL
    urltext = 'http://www.bloomberg.com/quote/' + s
    # print urltext

    # Open URL
    ##########
    url = urllib2.urlopen(urltext)
    soup = BeautifulSoup(url)

    # 52w high and low
    ##################
    table = soup.find('table', 'snapshot_table')
    row = table.find('tr', 'bottom')
    cells = row.findChildren(['th', 'td'])
    numbers = cells[3].text
    wlow, trail = numbers.split(" ", 1)
    whigh = re.search(r'[\d,.]+$', trail).group(0)

    # Symbol lookup
    ###############
    tag = soup.find('h3')

    # Price lookup
    ###############
    tagprice_currency = soup.find('span', {'class': ' price'}).text
    # print tagprice_currency
def zamandaily(url):
    with open(url) as f:
        content = f.readlines()
    a = ""
    for line in content:
        a += line + " "
    soup = BeautifulSoup(a)
    output = "{\"source\": \"" + "Zaman Daily" + "\",\n"
    output += "\"url\": \"" + "http://www.todayszaman.com/diplomacy_ihh-icc-finds-israel-guilty-of-war-crimes-in-mavi-marmara-raid_363650.html" + "\",\n"
    output += "\"title\": "
    a = soup.find("title")
    output += "\"" + a.text + "\",\n"
    b = soup.find("div", {"class": "topDate"})
    # e.g. "November 30, 2014, Sunday"
    date = b.text
    arr = date.split(" ")
    month = arr[0]
    day = arr[1].strip(",")
    year = arr[2].strip(",")[2:]
    if month == "January":
        m = 1
    elif month == "February":
        m = 2
    elif month == "March":
        m = 3
    elif month == "April":
        m = 4
    elif month == "May":
        m = 5
    elif month == "June":
        m = 6
    elif month == "July":
        m = 7
    elif month == "August":
        m = 8
    elif month == "September":
        m = 9
    elif month == "October":
        m = 10
    elif month == "November":
        m = 11
    elif month == "December":
        m = 12
    d = str(m) + "/" + day + "/" + year
    output += "\"date\": \"" + d + "\",\n"
    output += "\"article_text\": \""
    c = soup.findAll("p")
    for paragraph in c:
        try:
            output += paragraph.text.decode("utf-8").replace("\"", "'").replace("^M", "").replace("\n", "").replace("\\", "").replace(r'\r', '').replace("\r", "").replace("\n", "") + " "
        except UnicodeEncodeError:
            pass
    output += "\"}\n"
    # output = output.replace("\"","")
    return ''.join([i if ord(i) < 128 else ' ' for i in output])
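# The long if/elif chain above maps month names to numbers; a more compact
# alternative (a sketch, not part of the original scraper) uses the standard
# calendar module:
import calendar

MONTH_NUMBERS = {name: idx for idx, name in enumerate(calendar.month_name) if name}

def month_number(name):
    # e.g. month_number("November") == 11
    return MONTH_NUMBERS[name]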
# get total number of rows
print("Total no. of rows: %d" % (csvreader.line_num))

for row in rows[:]:
    # link
    print(len(row))
    links.append(row[0])
    address.append(row[1])
    name.append(row[2])
    driver.get(row[0])
    content = driver.page_source
    soup = BeautifulSoup(content)
    a = soup.find('div', attrs={'class': 'rv_highlights__section pr10'})
    if a is not None:
        f = a.find('div', attrs={'class': 'fontsize13 ln18'})
        # print(f)
        food.append(f.text)
    else:
        food.append("")

df = pd.DataFrame({
    'Link': links,
    'Address': address,
    'Name': name,
    'Food Name': food
})
df.to_csv('Fooddataset.csv', index=False, encoding='utf-8')
def parse_html(db, cursor):
    # create table for film data
    print "creating nyt_data"
    cursor.execute('''drop table if exists nyt_data''')
    cursor.execute('''
        create table nyt_data
        (path varchar(255), movie_title varchar(255), text_body text,
         people_names text, years text)''')

    # initialize final dataframe
    nyt_data = pd.DataFrame()

    # i want to print progress, so we need to know how many files we're iterating over
    # (inefficient/redundant but low cost)
    ind = 1
    total_files = 0
    for root, dirs, files in os.walk("./nyt"):
        for file in files:
            if file.endswith('.html'):
                total_files += 1

    print "processing nyt data"
    # loop through the files in the directory
    for root, dirs, files in os.walk("./nyt"):
        for name in files:
            if name.endswith(".html"):
                stdout.write("\r%d/%d" % (ind, total_files))
                stdout.flush()
                df = {}
                # initialize beautifulsoup stuff
                path = root + "/" + name
                article = open(path, 'r')
                soup = BeautifulSoup(article)
                df['path'] = path

                # get the movie title first. in almost all cases it's either in a div with
                # id = movieTitle or in an itemprop tag
                div_title = soup.find('div', {'id': 'movieTitle'})
                itemprop_title = soup.find(itemprop="name")

                # best source seems to be itemprop_title. however it's only available after 2008.
                # if 2007/2008, use the div_title
                # this method seems to correctly get 496/500 titles, the missing ones seem to be one-off
                # changes in formatting. is there a better way to get these remaining 4? hmmm
                if re.match(".*/200[78]/.*", path):
                    if div_title:
                        df['movie_title'] = re.sub("\(.*", "", div_title.text).strip()
                    else:
                        df['movie_title'] = None
                else:
                    if itemprop_title:
                        df['movie_title'] = itemprop_title.text
                    else:
                        df['movie_title'] = None

                # now look for names of people
                # grab the body of the story, which can be in two places depending when the article was written
                if re.match(".*/200[78]/.*", path):
                    text_body = soup.findAll('div', attrs={'class': 'articleBody'})
                    if len(text_body) > 1:
                        text_body = [x.text for x in text_body]
                        text_body = ' '.join(text_body)
                        df['text_body'] = text_body
                    elif len(text_body) == 1:
                        text_body = text_body[0].text
                        df['text_body'] = text_body
                    else:
                        df['text_body'] = None
                else:
                    text_body = soup.findAll('div', {'id': 'story-body'})
                    if len(text_body) > 1:
                        text_body = [x.text for x in text_body]
                        text_body = ' '.join(text_body)
                        df['text_body'] = text_body
                    elif len(text_body) == 1:
                        text_body = text_body[0].text
                        df['text_body'] = text_body
                    else:
                        df['text_body'] = None

                # use nltk to pull words that look like names
                # method borrowed from:
                # http://timmcnamara.co.nz/post/2650550090/extracting-names-with-6-lines-of-python-code
                # i chose this method over others (there are lots of ways to do this!) because it is
                # relatively short/simple and seemed to produce good results (not too much non-name stuff,
                # caught most of the names)
                director_names = ""
                if text_body:
                    for sent in nltk.sent_tokenize(text_body):
                        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                            if hasattr(chunk, 'node'):
                                if chunk.node == "PERSON":
                                    director_names = director_names + " " + ' '.join(c[0] for c in chunk.leaves())
                    df['people_names'] = director_names
                else:
                    df['people_names'] = None

                # pull out things that look like years
                # also include year that article was published
                if text_body:
                    years = re.findall("\d{4}", text_body)
                    years_string = " ".join(years) + " " + re.search("\d{4}", path).group(0)
                    df['years'] = years_string
                else:
                    df['years'] = None

                # put data for this page in our main dataframe
                nyt_data = nyt_data.append(df, ignore_index=True)
                ind += 1

    # put the main dataframe in the db
    stdout.write("\n")
    print "inserting data to db"
    nyt_data.to_sql(con=db, name='nyt_data', flavor='mysql', index=False, if_exists='append')
    db.commit()
def getFighterDetails(self, fighterID):
    """
    Return fighter details for a given fighter ID from sherdog.com's fightfinder.

    Arguments:
    fighterID -- A String containing the fighter's numeric ID from sherdog.com

    Returns:
    fighterDetails -- A dictionary containing the fighter's details as scraped from sherdog.com

    fighterDetails keys:
    ID -- Fighter's ID
    name -- Fighter's full name
    nickName -- Fighter's current nickname
    association -- Fighter's current camp/association
    height -- Fighter's height
    weight -- Fighter's weight (in lbs)
    birthDate -- Fighter's date of birth
    city -- Fighter's city of birth
    country -- Fighter's country of birth
    thumbUrl -- URL of fighter image
    """
    # initialise dict to store fighter details, with all keys set to empty values
    fighterDetails = {}
    fighterDetails['ID'] = ''
    fighterDetails['name'] = ''
    fighterDetails['nickName'] = ''
    fighterDetails['association'] = ''
    fighterDetails['height'] = ''
    fighterDetails['weight'] = ''
    fighterDetails['birthDate'] = ''
    fighterDetails['city'] = ''
    fighterDetails['country'] = ''
    # store fighter ID in dict
    fighterDetails['ID'] = fighterID
    # generate fighter url
    url = self.__fighterURL__ % fighterID
    # retrieve html and initialise beautifulsoup object for parsing
    soup = BeautifulSoup(self.getHtml(url))
    bio = soup.find("div", {"class": "module bio_fighter"})
    fighterDetails['name'] = bio.h1.find(text=True)
    try:
        fighterDetails['nickName'] = bio.find("span", {"class": "nickname"}).em.string
    except Exception:
        fighterDetails['nickName'] = ''
    try:
        fighterDetails['association'] = bio.find("span", {"class": "item association"}).strong.string
    except Exception:
        fighterDetails['association'] = ''
    # height and weight are listed as "<strong>value</strong> unit"
    heightTemp = bio.find("span", {"class": "item height"})
    fighterDetails['height'] = ("%s %s" % (heightTemp.strong.string, heightTemp.findAll(text=True)[3].string)).rstrip()
    weightTemp = bio.find("span", {"class": "item weight"})
    fighterDetails['weight'] = ("%s %s" % (weightTemp.strong.string, weightTemp.findAll(text=True)[3].string)).rstrip()
    fighterDetails['birthDate'] = bio.find("span", {"class": "item birthday"}).findAll(text=True)[0].rsplit(":")[1].strip()
    try:
        birthpTemp = bio.find("span", {"class": "item birthplace"})
        fighterDetails['city'] = birthpTemp.findAll(text=True)[1].strip()
        fighterDetails['country'] = birthpTemp.strong.string
    except Exception:
        fighterDetails['city'] = ''
        fighterDetails['country'] = ''
    """ Commented
    # check if row contains 'city' and store to fighterDetails dict
    elif infoItem[0].string.rstrip(' ').rstrip('\n') == 'City':
        fighterDetails['city'] = infoItem[1].string.rstrip(' ').rstrip('\n')
    # check if row contains 'country' and store to fighterDetails dict
    elif infoItem[0].string.rstrip(' ').rstrip('\n') == 'Country':
        fighterDetails['country'] = infoItem[1].string.rstrip(' ').rstrip('\n')
    # find and store url for fighter image
    fighterDetails['thumbUrl'] = soup.find("span", {"id" : "fighter_picture"}).img['src']
    """
    # return scraped details
    return fighterDetails
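# A minimal usage sketch for the Sherdog fightfinder methods getEventDetails()
# (earlier) and getFighterDetails() above. The class name "SherdogScraper" and
# the event ID are hypothetical; the real class must define __eventURL__,
# __fighterURL__ and getHtml().
# scraper = SherdogScraper()
# event = scraper.getEventDetails('12345')
# for fight in event['fights']:
#     fighter = scraper.getFighterDetails(fight['fighter1'])
#     print fighter['name'], fighter['weight']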
RANGE = 1000000

def strip_tags(s):
    start = s.find("<")
    if start == -1:
        return s  # No tags
    end = s.find(">", start) + 1
    return s[:start] + strip_tags(s[end:])

for i in xrange(RANGE):
    url = "http://www.ukrlp.co.uk/ukrlp/ukrlp_provider.page_pls_provDetails?x=&pn_p_id=1%07d&pv_status=VERIFIED" % (i)
    html = scraperwiki.scrape(url)
    bs = BeautifulSoup(html)
    data = {}
    content = bs.find("div", {"class": "pod_main_body"})
    title = content.findAll("div", {"class": "provhead"})
    data["ukprn"] = int(title[0].string.split()[1])
    data["name"] = strip_tags(str(title[1])).strip()
    raw = str(content)
    start = raw.find('<div class="assoc">Legal address</div>')
    end = raw.find('<div class="assoc">Primary contact address</div>')
    address = ""
    for line in raw[start:end].split("<br />")[1:]:
        l = line.strip()
        if l.startswith("<strong>"):
            s = l.find(">") + 1
            e = l.find("</strong>")
            data["legal_" + l[s:e].strip()[:-1]] = l[e+9:].strip()
        else:
            if l == "<":
                continue
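# Quick illustration of strip_tags() above (the sample markup is hypothetical,
# not taken from a UKRLP page):
# strip_tags('<div class="provhead">Example <b>Provider</b></div>')
# -> 'Example Provider'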
def enhance(request):
    check_login(request)
    session = request.session
    google_resource_id = ""
    slideshare_id = ""
    embed_google = False
    embed_slideshare = False
    not_converted = True
    show_iframe = False
    form = Form(request, schema=QuestionAnswerSchema)
    validate_form = form.validate()
    print form.all_errors()
    if session.has_key('google-resource-id'):
        google_resource_id = session['google-resource-id']
    if session.has_key('slideshare_id'):
        slideshare_id = session['slideshare_id']
        if fetch_slideshow_status(slideshare_id) == "2":
            not_converted = False
            show_iframe = True
    if google_resource_id != "":
        embed_google = True
    if slideshare_id != "":
        embed_slideshare = True
    templatePath = "templates/google_ss_preview.pt"
    if validate_form:
        introductory_paragraphs = request.POST.get('introductory_paragraphs')
        question_count = 0
        cnxml = session["cnxml"] + """<content><section id="intro-section-title"><title id="introtitle">Introduction</title><para id="introduction-1">""" + introductory_paragraphs + """</para></section><section id="slides-embed"><title id="slide-embed-title">View the slides</title><figure id="ss-embed-figure"><media id="slideshare-embed" alt="slideshare-embed"><iframe src="http://www.slideshare.net/slideshow/embed_code/""" + slideshare_id + """" width="425" height="355" /></media></figure></section>"""
        for i in range(1, 6):
            form_question = request.POST.get('question-' + str(i))
            if form_question:
                # this gives us something like 'answer-1-1', so our solution is this
                form_radio_answer = request.POST.get('radio-' + str(i))
                question_count += 1
                if question_count == 1:
                    cnxml += """<section id="test-section"><title>Test your knowledge</title>"""
                itemlist = ""
                for j in range(1, 10):
                    try:
                        form_all_answers = request.POST.get('answer-' + str(i) + '-' + str(j))
                        if form_all_answers:
                            itemlist += "<item>" + form_all_answers + "</item>"
                    except:
                        print "No element found"
                if form_radio_answer:
                    solution = request.POST.get(form_radio_answer)
                    cnxml += """<exercise id="exercise-""" + str(i) + """"><problem id="problem-""" + str(i) + """"><para id="para-""" + str(i) + """">""" + str(form_question) + """<list id="option-list-""" + str(i) + """" list-type="enumerated" number-style="lower-alpha">""" + str(itemlist) + """</list></para></problem>"""
                else:
                    print "ELSE CONDITION OF radio"
                    solution = request.POST.get('answer-' + str(i) + '-1')
                    cnxml += """<exercise id="exercise-""" + str(i) + """"><problem id="problem-""" + str(i) + """"><para id="para-""" + str(i) + """">""" + str(form_question) + """</para></problem>"""
                print "FORM RADIO ANSWER", form_radio_answer
                print "SOLUTION", solution
                cnxml += """ <solution id="solution-""" + str(i) + """"> <para id="solution-para-""" + str(i) + """">""" + solution + """</para></solution></exercise>"""
                """form_solution = request.POST.get('solution-'+str(i))
                all_post_data = {"data":{"options":form_options,"solution":form_solution,"question":form_question}}
                for question in all_post_data:
                    options = all_post_data[question]['options']
                    solution = all_post_data[question]['solution']
                    asked_question = all_post_data[question]['question']
                    optionlist=""
                    for option in options:
                        optionlist+="<item>"+option+"</item>"""
                #cnxml+="""<exercise id="exercise-"""+str(j)+""""><problem id="problem-"""+str(j)+""""><para id="para-"""+str(j)+"""">"""+str(asked_question)+"""<list id="option-list-"""+str(j)+"""" list-type="enumerated" number-style="lower-alpha">"""+str(optionlist)+"""</list></para></problem>"""
                #cnxml+=""" <solution id="solution-"""+str(j)+""""> <para id="solution-para-"""+str(j)+"""">"""+solution+"""</para></solution></exercise>"""
                #j+=1
        metadata = session['metadata']
        if question_count >= 1:
            cnxml += "</section></content></document>"
        else:
            cnxml += "</content></document>"
        workspaces = [(i['href'], i['title']) for i in session['login'].collections]
        metadata_entry = sword2cnx.MetaData(metadata)
        zipped_filepath = session['userfilepath']
        zip_archive = zipfile.ZipFile(zipped_filepath, 'w')
        zip_archive.writestr("index.cnxml", cnxml)
        zip_archive.close()
        conn = sword2cnx.Connection("http://cnx.org/sword/servicedocument",
                                    user_name=session['login'].username,
                                    user_pass=session['login'].password,
                                    always_authenticate=True,
                                    download_service_document=True)
        collections = [{'title': i.title, 'href': i.href} for i in sword2cnx.get_workspaces(conn)]
        session['login'].collections = collections
        workspaces = [(i['href'], i['title']) for i in session['login'].collections]
        session['workspaces'] = workspaces
        with open(zipped_filepath, 'rb') as zip_file:
            deposit_receipt = conn.create(
                col_iri=workspaces[0][0],
                metadata_entry=metadata_entry,
                payload=zip_file,
                filename='upload.zip',
                mimetype='application/zip',
                packaging='http://purl.org/net/sword/package/SimpleZip',
                in_progress=True)
        session['dr'] = deposit_receipt
        session['deposit_receipt'] = deposit_receipt.to_xml()
        soup = BeautifulSoup(deposit_receipt.to_xml())
        data = soup.find("link", rel="edit")
        edit_iri = data['href']
        session['edit_iri'] = edit_iri
        creator = soup.find('dcterms:creator')
        username = session['login'].username
        email = creator["oerdc:email"]
        url = "http://connexions-oerpub.appspot.com/"
        post_values = {
            "username": username,
            "email": email,
            "slideshow_id": slideshare_id
        }
        data = urllib.urlencode(post_values)
        google_req = urllib2.Request(url, data)
        google_response = urllib2.urlopen(google_req)
        now_string = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        temp_dir_name = '%s-%s' % (request.session['login'].username, now_string)
        save_dir = os.path.join(request.registry.settings['transform_dir'], temp_dir_name)
        os.mkdir(save_dir)
        request.session['upload_dir'] = temp_dir_name
        cnxml = clean_cnxml(cnxml)
        save_cnxml(save_dir, cnxml, [])
        return HTTPFound(location=request.route_url('metadata'))
        # return HTTPFound(location=request.route_url('updatecnx'))
    response = {
        'form': FormRenderer(form),
        "slideshare_id": slideshare_id,
        "google_resource_id": google_resource_id,
        "embed_google": embed_google,
        "embed_slideshare": embed_slideshare,
        "not_converted": not_converted,
        "show_iframe": show_iframe
    }
    return render_to_response(templatePath, response, request=request)
##################
# Note from creator (Yomal Mudalige) - I have tried to change the column order,
# but it is still not finalized, so I added the prefixes 'A', 'B', etc. Thanks.
# Reference - Scraperwiki Tutorial 3
##################
import scraperwiki
from BeautifulSoup import BeautifulSoup

print "Premier League Football 2011/2011 Points Tables"

html = scraperwiki.scrape('http://www.guardian.co.uk/football/premierleague')
soup = BeautifulSoup(html)

scraperwiki.metadata.save('data_columns', ['Team', 'Pld', 'GD', 'Pts'])

data_table = soup.find("table", {"class": "full"})
rows = data_table.findAll("tr")

m = 0
for row in rows:
    print m, row
    if m < 0:
        m = m + 1
        continue
    else:
        record = {}
        table_cells = row.findAll("td")
        if table_cells:
            record['A- Team'] = table_cells[0].text
            record['B-Matches Played'] = table_cells[1].text
            record['C-Goal Difference'] = table_cells[2].text
            record['D-Points'] = table_cells[3].text
            print record, '------------'
            scraperwiki.datastore.save(["A- Team"], record)
    ['http://www.motogp.com/en/Results+Statistics/2011/AUS/MotoGP', 'Phillip Island', '2011'],
    #['http://www.motogp.com/en/Results+Statistics/2011/MAL/MotoGP','Sepang','2011']]
    ['http://www.motogp.com/en/Results+Statistics/2011/VAL/MotoGP', 'Valencia', '2011']
]

for entry in url:
    #print (entry[1])
    page = mech.open(entry[0])
    html = page.read()
    soup = BeautifulSoup(html)
    table = soup.find("table", {"class": "width100 marginbot10"})
    col = table.findAll("tr")
    tds = col[2]("td")
    track = entry[1]
    season = entry[2]
    date = tds[0].text
    rider = tds[1].text
    time = tds[2].text
    speed = re.search('(.*)[^ Km/h]', tds[3].text).group(0)
    #print(lap)
    #print(track,season,rider,date,time,speed)
    scraperwiki.sqlite.save(unique_keys=["circuit", "season"],
                            data={
                                "circuit": track,
                                "season": season,