def get_onepage_poclist(page):
    """Return the list of PoC links found on one page of the Beebeeto index."""
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''
    bt = BeautifulSoup(info)
    # The highlighted pager link; if page 1 is highlighted while a later
    # page was requested, we have run past the last page.
    end = bt.find('a', {'style': "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"})
    if '1' == end.renderContents() and page != 1:
        return ''
    ret = bt.find('div', {'class': 'mainlist'})
    ret = ret.renderContents()
    if ret == "":
        return ""
    retlist = []
    rets = re.findall('<a href=.*?>', ret)
    for one in rets:
        if "poc-" in one:
            one = one.replace('<a href="', "")
            one = one.replace('">', "")
            one = one.strip()
            retlist.append(one)
    return retlist
def __parse_genres(self, data):
    """
    Parses the list of genres.
    """
    self.call_service(msgs.UI_ACT_SHOW_INFO,
                      "SHOUTcast made it illegal for free software to access\n"
                      "their full directory.\n"
                      "You will only get the Top 10 stations listed per genre.")
    genres = []
    soup = BeautifulSoup(data)
    radiopicker = soup.find("div", {"id": "radiopicker"})
    if (radiopicker):
        for genre_tag in radiopicker.findAll("li", {"class": "prigen"}):
            name = genre_tag.a.contents[0]
            name = name.replace("&amp;", "&")
            genres.append(name)
        #end for
    #end if
    if (not genres):
        self.__current_folder.message = "genre list not available"
        logging.error("SHOUTcast genre listing download failed:\n%s",
                      logging.stacktrace())
    genres.sort()
    return genres
def _get_video_links(self, html_data):
    soup = BeautifulSoup(''.join(html_data),
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    link_tds = soup.findAll('td', width='420')
    link_a = []
    for td in link_tds:
        link_a.append(td.find('a')['href'])
    return link_a
def __parse_genres(self, data):
    """
    Parses the list of genres.
    """
    genres = []
    soup = BeautifulSoup(data)
    tagcloud = soup.find("ul", {"class": "tag-cloud"})
    if (tagcloud):
        for genre_tag in tagcloud.findAll("a", {"class": "tag"}):
            name = genre_tag["title"]
            href = genre_tag["href"]
            genres.append((name, href))
        #end for
    #end if
    if (not genres):
        self.__current_folder.message = "genre list not available"
        logging.error("icecast genre listing download failed:\n%s",
                      logging.stacktrace())
    genres.sort(lambda a, b: cmp(a[0], b[0]))
    return genres
def _get_video_links(self, html_data):
    soup = BeautifulSoup(''.join(html_data),
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    div = soup.find('div', id='dual_list')
    link_divs = div.findAll('div', {'class': re.compile('^dmpi_video_item')})
    link_a = []
    for item in link_divs:
        link_a.append(item.find('a', {'class': re.compile('^dmco_simplelink video_title')})['href'])
    return link_a
def strainSoup(self, xml):
    soup = BeautifulSoup(xml)
    subtitle = soup.find('subtitle', attrs={'link': None})
    if subtitle:
        _id = int(subtitle['id'])
        _iv = subtitle.find('iv').contents[0]
        _data = subtitle.data.string
        return _id, _iv, _data
    else:
        print "CRUNCHYROLL: --> Couldn't parse XML file."
def __parse_stations(self, data, genre):
    """
    Parses the list of stations.
    """
    stations = []
    soup = BeautifulSoup(data)
    resulttable = soup.find("div", {"id": "resulttable"})
    if (resulttable):
        for entry in resulttable.findAll("div", {"class": "dirlist"}):
            station = File(self)
            a_tag = entry.find("a", {"class": "playbutton playimage"})
            playing_tag = entry.find("div", {"class": "playingtext"})
            bitrate_tag = entry.find("div", {"class": "dirbitrate"})
            type_tag = entry.find("div", {"class": "dirtype"})
            if (not a_tag or not playing_tag or not bitrate_tag or not type_tag):
                continue
            station.resource = a_tag["href"]
            station.name = a_tag["title"]
            now_playing = playing_tag["title"]
            bitrate = bitrate_tag.contents[0].strip()
            typename = type_tag.contents[0].strip()
            if (typename == "MP3"):
                station.mimetype = "audio/mpeg"
            elif (typename == "AAC+"):
                station.mimetype = "audio/mp4"
            else:
                station.mimetype = "audio/x-unknown"
            station.path = File.pack_path("/" + urlquote.quote(genre, ""),
                                          station.name, bitrate,
                                          station.mimetype, station.resource,
                                          genre)
            station.info = "Bitrate: %s kb\n" \
                           "Now playing: %s" % (bitrate, now_playing)
            station.icon = theme.shoutcast_station.get_path()
            stations.append(station)
        #end for
    #end if
    if (not stations):
        self.__current_folder.message = "station list not available"
        logging.error("SHOUTcast station listing download failed\n%s",
                      logging.stacktrace())
    stations.sort()
    return stations
def getPoc(poc):
    """Download the Python source of a single PoC from Beebeeto."""
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    # Skip pages that only show a placeholder image instead of the PoC source.
    if '<img src="/static/img/test.jpg"' in info:
        return ''
    bt = BeautifulSoup(info)
    ret = bt.find('pre', {'class': "brush: python;"})
    ret = ret.renderContents()
    if ret:
        return strip_tags(ret)
    else:
        return ''
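# Usage sketch (an assumption, not part of the original source): walk the
# paginated Beebeeto index with get_onepage_poclist() until it reports no
# more results, then pull each PoC's source with getPoc(). The href format
# is assumed to embed a 'poc-...' identifier, as the filter above suggests.
def fetch_all_pocs(max_pages=10):
    pocs = {}
    for page in range(1, max_pages + 1):
        links = get_onepage_poclist(page)
        if not links:  # '' or [] signals there are no more pages
            break
        for link in links:
            # getPoc() expects the bare 'poc-...' identifier, so strip any
            # surrounding path components from the href.
            name = link.strip('/').split('/')[-1]
            source = getPoc(name)
            if source:
                pocs[name] = source
    return pocs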
def get_recent_updates(self):
    WIKIRAGE_URL = "http://www.wikirage.com/rss/1/"
    from google.appengine.api import urlfetch
    fetch_page = urlfetch.fetch(WIKIRAGE_URL, follow_redirects=False)
    from utils.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(fetch_page.content)
    updates = []
    wiki_topics = [guid.contents[0].split('/')[-2]
                   for guid in soup.findAll('guid')]
    import urllib
    for topic in wiki_topics:
        topic = urllib.unquote(topic)
        topic_name = topic.replace('_', ' ')
        updates.append({'topic_path': topic, 'topic_name': topic_name})
    return updates
def _get_video_details(self, html_data):
    soup = BeautifulSoup(''.join(html_data),
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    script = soup.find('script', text=re.compile('flashvars'))
    title = re.compile('flashvars.title = "(.+?)";').findall(script.string)
    description = re.compile('flashvars.description = "(.+?)";').findall(script.string)
    tags = re.compile('flashvars.tags = "(.+?)";').findall(script.string)
    category = re.compile('flashvars.category = "(.+?)";').findall(script.string)
    video = MegaVideoVideo()
    video.title = strip_accents(urllib.unquote(title[0].replace('+', ' ')))
    video.description = strip_accents(urllib.unquote(description[0].replace('+', ' ')))
    video.category = strip_accents(urllib.unquote(category[0].replace('+', ' ')))
    video.tags = strip_accents(urllib.unquote(tags[0].replace('+', ' ')))
    return video
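# Self-contained sketch of the flashvars extraction used above, run against
# synthetic data; the sample string is an assumption, not a real MegaVideo
# page. Values are '+'-encoded and URL-quoted, hence the replace/unquote.
import re
import urllib

_sample_script = 'flashvars.title = "Some+Title"; flashvars.category = "Misc";'
_title = re.compile('flashvars.title = "(.+?)";').findall(_sample_script)
print urllib.unquote(_title[0].replace('+', ' '))  # -> Some Title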
def _GetPostingPage(page_num):
    url = urllib.urlopen(settings.SOURCE_ALL_URL + str(page_num))
    data = BeautifulSoup(url.read())
    postings = data.findAll('li', {'class': re.compile('hlisting')})
    for posting in postings:
        posting_object = models.Posting()
        try:
            # Search within the already-parsed tag instead of re-wrapping
            # it in a new BeautifulSoup instance.
            descr_div = posting.findAll('div', {'class': re.compile('description')})[0]
        except IndexError:
            continue
        descr_div = descr_div.findAll('h3')[0]
        description = descr_div.findAll('a')[0]
        url = description.get('href')
        posting_object.content = posting
        logging.critical(description)
        logging.critical(url)
def get_soup(self, page):
    # in case we need to meet 100k limit, truncate page.
    soup_url = SEMANTICPROXY_URL + str(page.url)  # + TRUNCATE URL +
    # timeout for fetch_page (and all fetch pages)
    try:
        logging.debug('Fetching calais response')
        fetch_page = urlfetch.fetch(soup_url)  # perform semantic analysis
    except:
        logging.debug('Unable to fetch calais response')
        return False
    soup = BeautifulSoup(fetch_page.content)  # whole page
    try:
        # look for error
        exception = soup.findAll('exception')[0].contents[0]
        print exception
        return False
    except:
        return soup
def _get_video_info(self, video_url):
    '''
    Return direct URL to video.
    '''
    # get the page
    data = urllib.urlopen(video_url)
    soup = BeautifulSoup(''.join(data.read()),
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    # find the location of the embed code
    div = soup.find('noscript')
    if div != None:
        rex = re.compile(r'mediaURL=(.*?)&', re.M)
        flashvars = div.contents[1].attrs[9][1].encode('utf-8')
        self._logger.debug('Metacafe flashvars:%s', flashvars)
        match = rex.search(flashvars)
        if match != None:
            return urllib.unquote(match.group(1))
        else:
            return None
    else:
        return None
def convertToASS(self, script):
    soup = BeautifulSoup(script, convertEntities=BeautifulSoup.HTML_ENTITIES)
    header = soup.find('subtitle_script')
    header = "[Script Info]\nTitle: " + header['title'] + "\nScriptType: v4.00+\nWrapStyle: " + header['wrap_style'] + "\nPlayResX: 624\nPlayResY: 366\n\n"
    styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n"
    events = "[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
    stylelist = soup.findAll('style')
    eventlist = soup.findAll('event')
    for style in stylelist:
        styles += "Style: " + style['name'] + "," + style['font_name'] + "," + style['font_size'] + "," + style['primary_colour'] + "," + style['secondary_colour'] + "," + style['outline_colour'] + "," + style['back_colour'] + "," + style['bold'] + "," + style['italic'] + "," + style['underline'] + "," + style['strikeout'] + "," + style['scale_x'] + "," + style['scale_y'] + "," + style['spacing'] + "," + style['angle'] + "," + style['border_style'] + "," + style['outline'] + "," + style['shadow'] + "," + style['alignment'] + "," + style['margin_l'] + "," + style['margin_r'] + "," + style['margin_v'] + "," + style['encoding'] + "\n"
    for event in eventlist:
        events += "Dialogue: 0," + event['start'] + "," + event['end'] + "," + event['style'] + "," + event['name'] + "," + event['margin_l'] + "," + event['margin_r'] + "," + event['margin_v'] + "," + event['effect'] + "," + event['text'] + "\n"
    formattedSubs = header + styles + events
    return formattedSubs
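# Usage sketch (assumed calling code, not part of the original source):
# feed convertToASS() the subtitle XML fetched elsewhere and write the
# generated ASS script to disk. 'converter', 'subtitle_xml', and the output
# path are placeholders.
def save_subtitles(converter, subtitle_xml, out_path='subtitles.ass'):
    ass_script = converter.convertToASS(subtitle_xml)
    f = open(out_path, 'w')
    try:
        # BeautifulSoup yields unicode attribute values, so encode on write.
        f.write(ass_script.encode('utf-8'))
    finally:
        f.close()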
def _get_video_info(self, video_url):
    # class="dm_widget_videoplayer"
    opener = CustomUrlOpener()
    page = opener.open(video_url)
    response = page.read()
    page.close()
    soup = BeautifulSoup(''.join(response),
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    div = soup.find('div', {'class': 'dm_widget_videoplayer'})
    script = div.find('script')
    if script != None:
        urls = re.compile('addVariable\("video", "(.*?)"\);').findall(script.string)
        if urls != None and len(urls) > 0:
            return self._split_urls(urls[0])
        else:
            return None
    else:
        self._logger.error("We couldn't get the dailymotion url of this video: %s", video_url)
        return None
def _get_video_details(self, html_data):
    soup = BeautifulSoup(''.join(html_data),
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    script = soup.find('script', text=re.compile('flashvars'))
    t = soup.find('h1', {'class': 'dmco_title'})
    title = t.string if t != None else ''
    d = soup.find('div', id='video_description')
    description = d.string if d != None else None
    c = soup.find('a', {'class': re.compile('fromchannel_link')})
    category = c.string if c != None else None
    tags_el = soup.find('div', {'class': re.compile('tags_cont')}).findAll('a')
    tags_list = []
    for a in tags_el:
        tags_list.append(a.string)
    tags = ','.join(tags_list)
    video = DailyMotionVideo()
    video.title = strip_accents(title)
    video.description = strip_accents(description) if description != None else None
    video.category = strip_accents(category)
    video.tags = strip_accents(tags)
    return video
def parse_summary(self, summary, link):
    #summary = escape.utf8(summary)
    soup = BeautifulSoup(summary)
    for script in list(soup.findAll('script')):
        script.extract()
    for o in soup.findAll(onload=True):
        del o['onload']
    for script in list(soup.findAll('noscript')):
        script.extract()
    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]
    for tag in self.remove_tags:
        for x in soup.findAll(tag['name']):
            x.extract()
    for base in list(soup.findAll(['base', 'iframe'])):
        base.extract()
    #for p in list(soup.findAll(['p', 'div'])):
    #    p['style'] = 'text-indent:2em'
    img_count = 1
    for img in list(soup.findAll('img')):
        if self.noimage or img_count >= self.max_images:
            img.extract()
        else:
            image_url = absolute_path(img['src'], link)
            image = self.down_image(image_url, link)
            if image:
                img['src'] = image
            else:
                img.extract()
        img_count = img_count + 1
    return soup.renderContents('utf-8')
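# parse_summary() above reads self.remove_attributes and self.remove_tags,
# whose contents are not shown in this listing. A plausible configuration
# (an assumption, for illustration only) would be:
#
#     self.remove_attributes = ['style', 'class', 'width', 'height']
#     self.remove_tags = [{'name': 'object'}, {'name': 'embed'}, {'name': 'form'}]
#
# i.e. a list of attribute names to strip from every tag, and a list of
# dicts whose 'name' key is the tag name to extract from the tree.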
def update_bill(self, bill):
    """ Check if a bill exists in datastore, and update its stats. """
    this_bill = Bill.get_by_key_name(bill['title'])
    logging.info(bill['title'])
    if this_bill is None:
        this_bill = self.create_bill(bill)
        is_new_bill = True
    else:
        is_new_bill = False
    this_bill.rank = bill['rank']
    import urllib
    self.request_args = {'bill_id': bill['id']}
    self.formatted_args = urllib.urlencode(self.request_args)
    from google.appengine.api import urlfetch
    fetch_page = urlfetch.fetch(url=OPENCONGRESS_INFO_URL + self.formatted_args,
                                method=urlfetch.GET)
    from utils.BeautifulSoup import BeautifulSoup
    document = BeautifulSoup(fetch_page.content)
    property_count = 0
    this_bill.introduction_date = str(document.findAll('li')[property_count]).split('</strong> ')[1].split('</li>')[0]
    this_bill.status = str(document.findAll('li')[property_count + 1]).split('</strong> ')[1].split('</li>')[0]
    if this_bill.status == "This Bill Has Become Law":
        property_count = -1  # no next step
    else:
        this_bill.next_step = str(document.findAll('li')[property_count + 2]).split('</strong> ')[1].split('</li>')[0]
    this_bill.latest_action = str(document.findAll('li')[property_count + 3]).split('</strong> ')[1].split('</li>')[0]
    if len(this_bill.latest_action) > 68:
        this_bill.latest_action = " ".join(this_bill.latest_action.split(' ')[:9]) + "..."
    this_bill.sponsor = str(document.findAll('li')[property_count + 4]).split('</strong> ')[1].split('</li>')[0].decode('utf-8')
    this_bill.sponsor_name = this_bill.sponsor.split("[")[0]
    self.save.append(this_bill)
    if is_new_bill:
        self.send_email_updates(this_bill)
    return
def get_excerpt(content):
    soup = BeautifulSoup(content)
    return soup.getText()[:100]
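# Quick check of get_excerpt() (hypothetical input, for illustration):
# it strips markup and truncates to the first 100 characters of text.
#
# >>> get_excerpt('<p>Hello, <b>world</b>!</p>')
# u'Hello, world!'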
def __parse_stations(self, data, genre):
    """
    Parses the list of stations.
    """
    stations = []
    next_page_url = ""
    soup = BeautifulSoup(data)
    resulttable = soup.find("div", {"id": "content"})
    if (resulttable):
        for entry in resulttable.findAll("tr"):
            station = File(self)
            try:
                station.name = entry.find("span", {"class": "name"}).a.contents[0]
            except:
                continue
            try:
                now_playing = entry.find("p", {"class": "stream-onair"}).contents[1]
            except:
                now_playing = ""
            station.resource = _ICECAST_BASE + entry.find("td", {"class": "tune-in"}).find("a")["href"]
            try:
                bitrate = entry.find("td", {"class": "tune-in"}).findAll("p", {"class": "format"})[1]["title"]
            except:
                bitrate = "-"
            try:
                typename = entry.find("a", {"class": "no-link"}).contents[0].strip()
            except:
                typename = ""
            if (typename == "MP3"):
                station.mimetype = "audio/mpeg"
            elif (typename == "AAC+"):
                station.mimetype = "audio/mp4"
            else:
                station.mimetype = "audio/x-unknown"
            station.path = "/" + genre + "/" + \
                           self.__encode_station(station.name,
                                                 bitrate,
                                                 station.mimetype,
                                                 station.resource,
                                                 genre)
            station.info = "Bitrate: %s\n" \
                           "Now playing: %s" % (bitrate, now_playing)
            station.icon = theme.icecast_station.get_path()
            stations.append(station)
        #end for
        pager_tag = resulttable.find("ul", {"class": "pager"})
        if (pager_tag):
            link = pager_tag.findAll("a")[-1]
            if (not link.contents[0].isdigit()):
                # must be an arrow
                next_page_url = link["href"]
            #end if
        #end if
    #end if
    if (not stations):
        self.__current_folder.message = "station list not available"
        logging.error("icecast station listing download failed\n%s",
                      logging.stacktrace())
    return (stations, next_page_url)
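# Sketch of a caller for __parse_stations() above (an assumption; the real
# caller is not shown in this listing). It keeps following next_page_url
# until the pager runs out. 'fetch(url)' is a hypothetical helper returning
# the page body as a string; note that next_page_url is a relative href, so
# _ICECAST_BASE is prefixed here, and that this method must live on the same
# class because __parse_stations is name-mangled.
def _list_all_stations(self, genre, first_url):
    all_stations = []
    url = first_url
    while url:
        data = fetch(url)
        stations, next_url = self.__parse_stations(data, genre)
        all_stations.extend(stations)
        url = _ICECAST_BASE + next_url if next_url else ""
    return all_stations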