def render(self):
    content = cache.get(self.content_url)

    # If the page is not cached, retrieve it
    if content == None:
        opener = urllib2.build_opener()
        content = opener.open(self.content_url, timeout=5).read()
        # Save the page in cache
        cache.set(self.content_url, content)

    soup = BeautifulSoup(content)

    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(self.content_url, tag['href'])

    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))

    return html
def handler(sock, url):
    htmlsource = sock.read()
    soup = BeautifulSoup(htmlsource)
    content = soup.find(id=re.compile("postmessage_\d+"), name="td")
    if content is None:
        return "failed to read content"
    return unicode(content)
def get_organic_data(html_data):
    bs = BeautifulSoup(str(html_data))
    div_filter = bs.find('div', {'id': 'ires'})
    if div_filter:
        contents = div_filter.findAll('li', {'class': 'g'})
        return contents
    return None
def getRes(self):
    url = self.getResURL()
    page = urllib2.urlopen(url).read()  # .decode('GBK').encode('utf-8')
    soup = BeautifulSoup(page)
    main_wrapper = soup.findAll('div', {'class': 'main_wrapper'})[0]
    # print main_wrapper.prettify()
    clr_after = main_wrapper.findAll('div', {'class': 'clr_after'})[0]
    # print clr_after.prettify()
    items = clr_after.findAll('div', {'class': 'main'})[0]
    # print items.prettify()
    items1 = items.findAll('div', {'class': 'lowpriceList'})[0]
    print items1.prettify().decode('utf-8').encode('gbk')
    items2 = items1.findAll('div', {'id': 'hdivResultTable'})[0]
    # print items2.prettify().decode('utf-8').encode('gbk')
    for item in items2:
        print item
        inc = str(item.findAll('td', {'class': 'col3'})[0].contents[0].string)
        fly_time = str(item.findAll('td', {'class': 'col4'})[0].contents[0].string)
        _time = str(item.findAll('td', {'class': 'col2'})[0].contents[0].string)
        _discount = str(item.findAll('span', {'class': 'disc'})[0].contents[0].string)
        _price = str(item.findAll('span', {'class': 'pr'})[0].contents[0].string)
        print inc  # .decode('utf-8').encode('gbk')
        print fly_time  # .decode('utf-8').encode('gbk')
        print _time  # .decode('utf-8').encode('gbk')
        print _discount.decode('utf-8').encode('gbk')
        print _price.decode('utf-8').encode('gbk')
def fetch_trains(place_from, place_to, date):
    key = 'trains_' + place_from + '_' + place_to + '_' + str(date)
    data = memcache.get(key)  # @UndefinedVariable
    if data != None:
        return data
    params = {'fromName': place_from,
              'toName': place_to,
              'when': utils.date_serialize(date),
              'search_type': 'suburban'}
    url = 'http://m.rasp.yandex.ru/search?' + urllib.urlencode(params)
    response = urlfetch.fetch(url)
    html = response.content
    soup = BeautifulSoup(html)
    list_node = soup.find("ul", {"class": "b-holster b-search-result"})
    if list_node != None:
        regex = re.compile(r'<.*?>')
        b_nodes = list_node.findAll("b")
        result = []
        for b_node in b_nodes:
            data = regex.split(b_node.renderContents())
            try:
                time = [datetime.datetime.strptime(x, '%H:%M').time() for x in data]
                result.append(TrainTiming(time[0], time[1]))
            except:
                pass
        memcache.add(key, result, 60 * 60)  # @UndefinedVariable
        return result
def render(self):
    # TODO: fix and enable caching
    # content = cache.get(self.content_url)
    content = None
    url = self.content_url

    # If the page is not cached, retrieve it
    if content == None:
        opener = urllib2.build_opener()
        content = opener.open(url, timeout=5).read()
        # Save the page in cache
        # cache.set(self.content_url, content)

    soup = BeautifulSoup(content)

    # TODO: Disabled. Add GET parameter support and enable.
    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    # for tag in soup.findAll('a', href=True):
    #     tag['href'] = urlparse.urljoin(self.content_url, tag['href'])

    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))

    return html
def get_episodes():
    """docstring for get_episodes"""
    html = retrieve_url("http://www.rtlklub.hu/most/musorok/automania")
    soup = BeautifulSoup(html, fromEncoding="utf-8")
    print soup.originalEncoding
    episodesHtml = soup.findAll("div", attrs={"class": "video-img-cont-catchup cont-first"})
    """ result
    <div class="video-img-cont-catchup cont-first" id="5217">
        <div class="video-date">okt 24.<span>12:15</span></div>
        <a href="http://www.rtlklub.hu/most/5217_automania_09-10-24" class="video-img">
            <img src="http://www.rtlklub.hu/most/files/thumbnails/005/217/2.jpg" width="120" height="90" alt="Autómánia 09-10-24" title="Autómánia 09-10-24" />
        </a>
        <a href="javascript:void(0)" class="video-add" id="5217-0">
            <img src="http://www.rtlklub.hu/most/style/img/add_video_icon.png" alt="Add a kedvenceid közé" title="Add a kedvenceid közé" />
        </a>
        <div class="img-height-wide"></div>
        <h2>
            <a href="http://www.rtlklub.hu/most/5217_automania_09-10-24">Autómánia 09-10-24</a>
        </h2>
        <p>Toyota Prius, Aprilia Tuono 1000R, Honda Accord 2.2 I-DTEC</p>
    </div>
    """
    episodes = []
    # print len(episodesHtml)
    for episode in episodesHtml:
        episodes.append({"title": episode.h2.a.string,
                         "url": episode.h2.a['href'],
                         "thumb": episode.a.img['src']})
    # print episodes
    return episodes
def parse(property_id, ratecode='SPGCP'):
    valid_property = False
    hotel_props = {'id': property_id}
    property_url = "%s?propertyID=%s" % (starwood_url, property_id)
    logging.debug("Property URL: %s" % property_url)

    starwood_response = urlfetch.fetch(url=property_url, deadline=10)
    if starwood_response:
        try:
            soup = BeautifulSoup(starwood_response.content).find(attrs={'id': 'propertyHighlight'}).find(attrs={'class': 'propertyContainer'})
        except:
            soup = None

        if soup:
            try:
                hotel_props['name'] = unicode(soup.find("a", "propertyName").contents[0]).strip()
                hotel_props['category'] = int(str(soup.find("span", "spgCategory").contents[0]).split()[-1])
                valid_property = True
            except:
                pass

            if valid_property:
                hotel_props['address'] = StarwoodParser.parse_address(soup)
                # hotel_props['awards'] = StarwoodParser.parse_starwood(soup.find("div", "tabsContentContainer").findAll("div", "tabContent"))
                hotel_props['image_url'] = str("http://www.starwoodhotels.com%s" % (soup.find("img", "propertyThumbnail")['src']))

    return valid_property and hotel_props or None
def parse_response(self):
    soup = BeautifulSoup(self.response)
    head = soup.find("head")
    self.max_points = int(_get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0))

    if _get_value_from_soup(head, "meta", "value", {"name": "status"}) == "accepted":
        self.is_accepted = True

    meta_title = _get_value_from_soup(head, "meta", "content", {"name": "DC.Title"})
    if meta_title:
        self.meta["title"] = meta_title
    else:
        title = soup.find("title")
        if title:
            self.meta["title"] = title.contents

    self.meta["description"] = _get_value_from_soup(head, "meta", "content", {"name": "DC.Description"}, "")

    points = _get_value_from_soup(head, "meta", "value", {"name": "points"})
    if points != None:
        self.points = int(points)
        self.is_graded = True
        self.is_accepted = True

    exercise_div = soup.body.find("div", {"id": "exercise"})

    if exercise_div != None:
        self.content = exercise_div.renderContents()
    else:
        self.content = soup.body.renderContents()
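# parse_response() above calls a _get_value_from_soup() helper that is not defined
# in this section. The sketch below is a hypothetical minimal version with an
# assumed signature and behaviour, not the original implementation: it returns the
# named attribute of the first matching tag inside `head`, or a default value.
def _get_value_from_soup(head, tag_name, attribute, attrs, default=None):
    element = head.find(tag_name, attrs) if head else None
    if element is None:
        return default
    value = element.get(attribute)
    return value if value is not None else default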
def parse_summary(self, summary, link):
    """Process the article content."""
    soup = BeautifulSoup(summary)

    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()

    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]

    for tag in soup.findAll(self.remove_tags):
        tag.extract()

    img_count = 0
    for img in list(soup.findAll('img')):
        if (self.max_image_number >= 0 and img_count >= self.max_image_number) \
                or img.has_key('src') is False \
                or img['src'].startswith("http://union.vancl.com/") \
                or img['src'].startswith("http://www1.feedsky.com/") \
                or img['src'].startswith("http://feed.feedsky.com/~flare/"):
            img.extract()
        else:
            try:
                localimage = self.down_image(img['src'], link)
                if localimage:
                    img['src'] = localimage
                    img_count = img_count + 1
                else:
                    img.extract()
            except Exception, e:
                print e
                img.extract()
def seturl(self):
    '''Set the URL, fetch it, and save the result.'''
    user = common.currentuser()
    if not user:
        common.error(self, 404, "User not found.")
        return
    ct = models.CustomTest.all().ancestor(user).get()
    if not ct:
        ct = models.CustomTest(parent=user)
    ct.setbypost(self.request.POST)
    if not ct.rss_link:
        soup = Soup(defines.defaulttesthtml)
    else:
        result = urlfetch.fetch(ct.rss_link)
        if result.status_code != 200:
            common.error(self, 200, "Url Fetch Error")
            return
        soup = Soup(result.content)
    try:
        ct.data = soup.prettify().decode('UTF-8')
    except ValueError, message:
        common.error(self, 200, message)
        return
def handler(sock, url):
    htmlsource = sock.read().decode('gb18030', 'replace').encode('utf-8')
    soup = BeautifulSoup(htmlsource)
    content = soup.find("td", {"class": "jiawenzhang-type"})
    if content is None:
        return "content not found"
    return unicode(content)
def view_page(slug):
    page = Page.gql("WHERE slug = :1", slug)[0]
    content = BeautifulSoup(page.content)
    codes = content.findAll('pre')
    for code in codes:
        code.contents[0].replaceWith(controllers.prettify_code(code.contents[0]))
    page.content = str(content)
    return render_template('cms_view_page.html', page=page)
def get_genres(self, url):
    """Return the available genres from the homepage."""
    html = download_page(url)
    ul_tags = BS(html, parseOnlyThese=SS('ul', {'class': 'menu'}))
    dirs = [{'name': a.span.string,
             'url': urljoin(self.base_url, a['href'] + '&limit=0'),
             'mode': '1'}
            for a in ul_tags.findAll('a')]
    self.add_dirs(dirs)
def location(self, ip):
    try:
        self.current_page = self.br.open('http://www.114best.com/ip/114.aspx?w=%s' % ip)
    except Exception:
        return "Earth"
    soup = BeautifulSoup(self.current_page)
    lo = soup.findAll('div', {"id": "output"})[0].findAll('b')[1].text.encode('utf-8', 'ignore')
    return lo
def getPresentation(self):
    base_url = 'http://my.yingjiesheng.com/xuanjianghui_province_'
    for i in range(1, 35):  # fetch the next two days' presentations for the 34 provinces [1-34]
        url = base_url + str(i) + '.html'
        # print url
        try:
            page = self.getRes(url)
            soup = BeautifulSoup(page)
        except:  # failed to open the url
            continue
        # Collect all the countdown blocks
        try:  # the current city may have no presentations in the near future
            countdowns = soup.findAll('div', {'class': 'list_topic'})
            y_m_d2, y_m_d3 = '', ''  # dates of the presentations on day two and day three
            first, second = -1, -1
            # Indexes of the 'campusTalk' tables for day two and day three. Each is offset
            # by one from its countdown div: table 0 is the header row and the real
            # presentation tables start at index 1, so day starts at 1.
            day = 1
            for countdown in countdowns:
                cd = string.atoi(countdown.contents[0].contents[2].string)
                if cd > 2:  # ignore presentations more than two days away
                    break
                elif cd == 1:  # presentations taking place tomorrow (countdown of 1 day)
                    first = day
                    y_m_d2 = countdown.contents[1].string
                elif cd == 2:  # presentations taking place the day after tomorrow (countdown of 2 days)
                    second = day
                    y_m_d3 = countdown.contents[1].string
                day = day + 1
            # first/second index tomorrow's and the day after tomorrow's tables; -1 means no presentations that day
            if first != -1:
                tables = soup.findAll('table', {'class': 'campusTalk'})[first]
                trs = tables.findAll('tr')
                for tr in trs:
                    tds = tr.findAll('td')
                    city = tds[0].a.string.strip()
                    school = tds[1].a.string.strip()
                    addr = tds[2].string.strip()
                    inc = tds[3].a.string.strip()
                    try:  # some presentations do not list a start time [H-M-S]
                        pdate = y_m_d2 + ' ' + tds[4].string
                    except Exception, e:
                        pdate = y_m_d2  # fall back to the date only
                    self.presentations.append(CPresentation(city, inc, school, pdate, addr))
            if second != -1:
                tables = soup.findAll('table', {'class': 'campusTalk'})[second]
                trs = tables.findAll('tr')
                for tr in trs:
                    tds = tr.findAll('td')
                    city = tds[0].a.string.strip()
                    school = tds[1].a.string.strip()
                    addr = tds[2].string.strip()
                    inc = tds[3].a.string.strip()
                    try:
                        pdate = y_m_d3 + ' ' + tds[4].string
                    except:
                        pdate = y_m_d3
                    self.presentations.append(CPresentation(city, inc, school, pdate, addr))
        except:
            pass
def assert_no_error_message_in_response(self, response):
    """Check that response has no error messages."""
    soup = BeautifulSoup(response)
    el = soup.find("p", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
    # Use find() here as well so el is a Tag (a ResultSet has no .contents attribute).
    el = soup.find("label", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
def Items(self):
    itemsprocessed = []
    cnt4debug = 0
    opener = URLOpener(self.host)
    decoder = AutoDecoder()
    for section, url in self.feeds:
        content = None
        cnt4debug += 1
        if IsRunInLocal and cnt4debug > 1:
            break

        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 and content:
            logging.error('err(%d) to fetch %s.' % (status_code, url))
            continue

        if self.feed_encoding:
            content = content.decode(self.feed_encoding)
        else:
            content = decoder.decode(content)

        content = self.preprocess(content)

        feed = feedparser.parse(content)
        for e in feed['entries']:
            # If the full-text RSS carries ads or other unwanted content,
            # it can be stripped out in postprocess.
            desc = self.postprocess(e.description)
            desc = self.FragToXhtml(desc, e.title, self.feed_encoding)

            if self.keep_image:
                soup = BeautifulSoup(content)
                self.soupbeforeimage(soup)
                for img in soup.findAll('img'):
                    imgurl = img['src']
                    if not imgurl.startswith('http') and not imgurl.startswith('www'):
                        imgurl = self.urljoin(url, imgurl)
                    imgresult = opener.open(imgurl)
                    imgcontent = imgresult.content if imgresult.status_code == 200 else None
                    if imgcontent:
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            if imgtype == 'jpeg':
                                fnimg = "%d.jpg" % random.randint(10000, 99999999)
                            else:
                                fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                            img['src'] = fnimg
                            yield (imgmime, imgurl, fnimg, imgcontent)
                self.soupprocessex(soup)
                desc = soup.renderContents('utf-8').decode('utf-8')
                soup = None

            if e.title not in itemsprocessed and desc:
                itemsprocessed.append(e.title)
                yield (section, e.link, e.title, desc)
def view_post(category_slug, post_slug):
    category = Category.gql("WHERE slug = :1", category_slug)[0]
    all_posts = Post.all().order('-date_created')
    post = [x for x in all_posts if x.category.slug == category_slug and x.slug == post_slug][0]
    content = BeautifulSoup(post.content)
    codes = content.findAll('pre')
    for code in codes:
        code.contents[0].replaceWith(controllers.prettify_code(code.contents[0]))
    post.content = unicode(content)
    return render_template('cms_view_post.html', post=post)
def fetchSong(url, viewCount):
    try:
        # Get song info from url
        songInfo = {}
        _get = url.split('?')[1]
        tokens = _get.split('&')
        for token in tokens:
            toks = token.split('=')
            songInfo[toks[0]] = int(toks[1])

        # Fetch the html
        lyricsWeb = urllib2.urlopen(url)
        webContent = lyricsWeb.read()
        lyricsWeb.close()

        soup = BeautifulSoup(webContent)
        lyrics = soup.findAll(id="mylrc")[0].contents
        author = soup.findAll(attrs={'class': 'link_hb'})[0].contents[0]
        album = soup.findAll(attrs={'class': 'link_hb'})[1].contents[0]
        title = soup.findAll(attrs={'class': 'link_hb'})[2].contents[0]

        # print lyrics
        lyricsText = ''
        for line in lyrics:
            for t in line:
                lyricsText += t

        # Construct the xml
        root = ET.Element("xml")
        doc = ET.SubElement(root, "doc")
        sidNode = ET.SubElement(doc, "sid")
        sidNode.text = str(songInfo[u'sid'])
        aidNode = ET.SubElement(doc, "aid")
        aidNode.text = str(songInfo[u'aid'])
        lidNode = ET.SubElement(doc, "lid")
        lidNode.text = str(songInfo[u'lid'])
        titleNode = ET.SubElement(doc, "title")
        titleNode.text = title
        authorNode = ET.SubElement(doc, "author")
        authorNode.text = author
        viewCountNode = ET.SubElement(doc, "viewCount")
        viewCountNode.text = str(viewCount)
        lyricsNode = ET.SubElement(doc, "lyrics")
        lyricsNode.text = lyricsText

        # Construct the tree
        tree = ET.ElementTree(root)
        filename = lyricsDbPath + str(songInfo['lid']) + ".txt"
        tree.write(filename, "utf-8")
    except:
        pass
def get_refresh_url(page_content):
    try:
        page_soup = BeautifulSoup(page_content)
        for meta_tag in page_soup.findAll('meta'):
            if meta_tag['http-equiv'].lower() == 'refresh':
                refresh_url = meta_tag['content'].split('URL=')[1]
                return refresh_url
    except:
        pass
    return None
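# A usage sketch for get_refresh_url() above: follow a small number of
# <meta http-equiv="refresh"> redirects before giving up. The function name,
# the max_hops limit and the use of urllib2 are assumptions for illustration,
# not part of the original snippet.
def fetch_following_refreshes(start_url, max_hops=3):
    url = start_url
    content = urllib2.urlopen(url).read()
    for _ in range(max_hops):
        refresh_url = get_refresh_url(content)
        if not refresh_url:
            break
        url = refresh_url
        content = urllib2.urlopen(url).read()
    return url, content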
def get_shows():
    """docstring for get_shows"""
    html = retrieve_url(BASE_URL)
    soup = BeautifulSoup(html, fromEncoding="utf-8")
    # print soup
    # print "Autómánia"
    showsHtml = soup.find(id="topnav04-ul").findAll("li")
    shows = []
    for show in showsHtml:
        shows.append({"title": show.a.string, "url": show.a['href']})
    return shows
def strip_professors(html, name):
    """Returns list of professor matches"""
    profs = []
    table = BeautifulSoup(html).find('div', {'id': 'ratingTable'})
    if table is None:
        logging.debug(html[500:])
        return profs

    split = name[:-1].upper().split(',')
    qLast = split[0]
    try:
        qFirst = split[1]
    except:
        qFirst = ''

    rows = table.findAll('div', {'class': re.compile(r"entry (odd|even)")})
    for row in rows:
        divName = row.find('div', {'class': 'profName'})
        anchor = divName.find('a')
        profName = unicode(anchor.renderContents().strip(), 'utf-8', 'ignore').upper()
        try:
            firstName = profName.split(',')[1]
        except:
            firstName = ''

        # logging.info('Searching against: ' + profName)
        if profName.startswith(qLast) and qFirst in firstName:
            href = 'http://www.ratemyprofessors.com/' + anchor['href'].strip()
            profDept = row.find('div', {'class': 'profDept'}).renderContents().strip()
            profRatings = row.find('div', {'class': 'profRatings'}).renderContents().strip()
            profQuality = row.find('div', {'class': 'profAvg'}).renderContents().strip()
            profEasiness = row.find('div', {'class': 'profEasy'}).renderContents().strip()
            profHot = row.find('div', {'class': re.compile(r".*\bprofHot\b.*")}).renderContents().strip()

            if profHot == 'Hot':
                profHot = '✓'
            else:
                profHot = ' '

            profs.append({'name': profName,
                          'href': href,
                          'dept': profDept,
                          'ratings': profRatings,
                          'quality': profQuality,
                          'easiness': profEasiness,
                          'hot': profHot})

    return profs
def parse_page(writer, catalogue, page=1):
    print 'Parsing page %s' % page

    url = urllib.urlopen(URL % (catalogue, page))
    soup = BeautifulSoup(url)

    table = soup.find('table', attrs={'class': 'snippets'})
    for tr in table.findAll('tr'):
        # get name of the page
        name = tr.td.h4.a.string
        # get URL of the page
        url = tr.td.h4.a['href'].encode('utf-8')
        # get stats info
        stats = '?'
        stats_element = tr.find('p', attrs={'class': 'Stats'})
        if stats_element:
            stats = stats_element.strong.nextSibling.string[1:-11].replace(' ', '')
            if stats == 'wtrakc':
                stats = '?'
        # get price
        price = tr.find('td', attrs={'class': 'Price'}).strong.string[0:-12]
        # calculate CPM
        cpm = '?'
        try:
            cpm = (float(price) * 30) / int(stats) * 1000
        except:
            cpm = '?'
        # write to the file
        row = [name, url, stats, price.replace('.', ','), str(cpm).replace('.', ',')]
        print row
        writer.writerow(row)

    # find last page of the catalogue
    anchors = soup.findAll('a', href=re.compile('/networks/[0-9]+/websites\?page=[0-9]+'))
    if not anchors:
        return
    pages = []
    for anchor in anchors:
        number = re.match('/networks/[0-9]+/websites\?page=([0-9]+)', anchor['href']).group(1)
        pages.append(int(number))
    pages.sort()
    last = pages[-1]

    # parse next page if exists
    if last > page:
        next = page + 1
        parse_page(writer, catalogue, next)
def assert_warning_message_in_response(self, response, message=""): """Check if response contains one or more warning messages. Assume warning messages rendered as <p class="alert-warning"> elements. """ soup = BeautifulSoup(response) alert = soup.findAll("p", "alert-warning") self.assertGreater(len(alert), 0, "no warning message found in response") if message: found = str(alert[0]).find(message) self.assertGreater(found, 0)
def assert_has_div_with_ID(self, response, id_attr):
    """Check if response contains a Div with a particular ID attribute.

    <div id="<some-id>"> elements.
    """
    soup = BeautifulSoup(response)
    alert = soup.findAll("div", id=id_attr)
    if alert:
        self.assertGreater(len(alert), 0, "No Div tag with, id=%s, in response" % str(id_attr))
    else:
        self.fail("No Div tag with, id=%s, in response" % str(id_attr))
def load(self):
    league_soup = BeautifulSoup(urllib2.urlopen(league_url).read())
    if league_soup:
        self.name = League.name(league_soup)
        self.mb = MessageBoard(self)

        team_rows = league_soup.find('table', attrs={'id': 'standingstable'}).tbody.findAll('tr')
        teams = [Team(self, team_id) for team_id in xrange(1, 2)]  # xrange(1, len(team_rows) + 1)]

        for team in teams:
            print "%s, %s, \"%s\" %s\n" % (team.name, team.record, team.smack, team.roster)
    '''
def getWeatherInfo(self, my_phone):
    for user in self.users:
        # Build the query URL
        url = self.url + self.province_map[user.province.encode('gbk')] + '/' + self.city_map[user.city.encode('gbk')] + '.html'
        # print url
        page = urllib2.urlopen(url).read().decode('GBK').encode('utf-8')
        soup = BeautifulSoup(page)
        # print page.decode('utf-8').encode('gbk')
        city_body = soup.find('div', {'class': 'w365border city_body'})
        weather_info = city_body.findAll('div', {'class': 'weather_div'})
        self.sendSMS(my_phone, weather_info[1], user)  # tomorrow's weather
        self.sendSMS(my_phone, weather_info[2], user)  # the day after tomorrow's weather
def __init__(self, league, team_id):
    team_url = "http://%s%s%s/%d?pak=%s" % (league.sport, YAHOO_FB, league.league_id, team_id, league.access_code)
    team_soup = BeautifulSoup(urllib2.urlopen(team_url).read()).find('div', attrs={'id': 'bd'})
    team_info_soup = team_soup.find('div', attrs={'id': 'teaminfo'})
    self.name = clean(team_info_soup.h1.em.contents[0])
    self.record = Team.parse_record(team_info_soup)
    try:
        self.smack = clean(team_info_soup.find('p', attrs={'id': 'smacktext'}).contents[0])
    except:
        self.smack = ''
    self.roster = Roster(league, team_id).players
def league_settings(league_id, access_code):
    response = urlfetch.fetch("http://football.fantasysports.yahoo.com/f1/%s/settings?pak=%s" % (league_id, access_code))
    settings_table_soup = BeautifulSoup(response.content).find("table", attrs={'id': 'settings-table'})
    positions = defaultdict(int)
    for p in [str(s.strip()) for s in settings_table_soup.findAll('tr')[23].find('td', attrs={'width': '410'}).b.contents[0].split(',')]:
        positions[p] += 1
    # bench_spots = roster_positions.count('BN')
    return positions
def strip_search(html):
    form_html = BeautifulSoup(html).find('form', action='http://websoc.reg.uci.edu/')
    # replace form submit with our own link
    form_html['action'] = '/schedules'
    # remove 'Display Text Results' button
    text_buttons = form_html.findAll(attrs={"class": "banner-width"})
    for i in text_buttons:
        i.replaceWith('<p id="submit-container"><input type="submit" value="Display Results" name="Submit"></p>')
    return str(form_html)
def get_script_urls(self, url, html):
    script_urls = []
    scripts = BeautifulSoup(html, parseOnlyThese=SoupStrainer('script'))
    for tag in scripts:
        if tag.has_key('src'):
            script_urls.append(self.get_absolute_url(url, tag['src']))
    return script_urls
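# get_script_urls() above depends on a get_absolute_url() helper that is not shown
# in this section. A plausible minimal version (hypothetical; the original may
# differ) resolves the src attribute against the page URL with urlparse:
def get_absolute_url(self, base_url, src):
    import urlparse
    return urlparse.urljoin(base_url, src)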
def find_video_links(self, html_message):
    soup = BeautifulSoup(html_message)
    embeds = soup('embed')
    tags = []
    for video in embeds:
        tags.append(db.Text(str(video)))
    return tags
def find_image_links(self, html_message):
    soup = BeautifulSoup(html_message)
    images = soup('img')
    links = []
    for img in images:
        links.append(db.Link(img['src']))
    return links
def hyphenate_html(html, language='en-us', hyphenator=None,
                   blacklist_tags=('code', 'tt', 'pre', 'head', 'title', 'script', 'style',
                                   'meta', 'object', 'embed', 'samp', 'var', 'math', 'select',
                                   'option', 'input', 'textarea')):
    r"""
    Hyphenate a fragment of HTML

    >>> hyphenate_html('<p>It is <em>beautiful</em> outside today!</p>')
    u'<p>It is <em>beau­ti­ful</em> out­side today!</p>'

    >>> hyphenate_html('O paralelepipedo atrevessou a rua', 'pt-br')
    u'O pa­ra­le­le­pi­pe­do atre­ves­sou a rua'

    Content inside <code>, <tt>, and <pre> blocks is not hyphenated

    >>> hyphenate_html('Document: <code>document + page_status</code>')
    u'Doc­u­ment: <code>document + page_status</code>'

    Short words are not hyphenated

    >>> hyphenate_html("<p>The brave men, living and dead.</p>")
    u'<p>The brave men, liv­ing and dead.</p>'
    """
    # Load hyphenator if one is not provided
    if not hyphenator:
        hyphenator = get_hyphenator_for_language(language)

    # Create HTML tree
    soup = BeautifulSoup(html)

    # Recursively hyphenate each element
    hyphenate_element(soup, hyphenator, blacklist_tags)

    return unicode(soup)
def updateprojectlist():
    print "updating the projects list"
    conn = httplib.HTTPConnection("android.git.kernel.org")
    conn.request("GET", "/")
    res = conn.getresponse()
    if res.status == httplib.OK:
        data = res.read()
        # print data
        conn.close()
        soup = BeautifulSoup(data)
        table = soup.body.table
        # print soup.body.table

        # filter
        tags = table.findAll('a', attrs={'class': 'list',
                                         'title': None,
                                         'href': re.compile('^/\?p')})
        # print tags
        projectlist = []
        for tag in tags:
            projectlist.append(tag.string)

        file = open(currentdir + "/" + listfilename, "w")
        # writelines won't add the '\n'
        file.writelines(map(lambda x: x.strip() + "\n", projectlist))
        file.close()
    else:
        print "fail to download the page: ", res.status, res.reason
def clawdata(data):
    data = urllib.urlencode(data)
    url = "http://www.powerball.com/powerball/pb_nbr_history.asp"
    response = urllib2.urlopen(url, data)
    soup = BeautifulSoup(response)
    for tag in soup.findAll(valign="middle"):
        csoup = BeautifulSoup(str(tag))
        dictIssue = dict()
        dictIssue["issueDate"] = ""
        dictIssue["luckNum"] = []
        if csoup.tr != None:
            for tag in csoup.tr.findAll('td'):
                if re.search("[0-9]+\/[0-9]+\/[0-9]{4}", str(tag.text)):
                    dictIssue["issueDate"] = str(tag.text)
                elif str(tag.text) != " ":
                    dictIssue["luckNum"].append(int(tag.text))
            print dictIssue
def getViewCount(songTitle):
    try:
        youtube = 'http://gdata.youtube.com/feeds/api/videos?v=2&max-results=1&q='
        # songTitle = urllib2.quote(songTitle)
        # print songTitle
        url = youtube + songTitle
        # print url
        web = urllib2.urlopen(url)
        content = web.read()
        web.close()
        soup = BeautifulSoup(content)
        stats = soup.findAll('yt:statistics')
        return int(stats[0]['viewcount'])
    except:
        return 0
def parse_organic_contents(raw_content, organic_pos):
    data_dict = {}
    data_dict['position'] = organic_pos

    b = BeautifulSoup(raw_content)
    rtitle = b.find('a')
    headline = p.sub('', str(rtitle))
    data_dict['title'] = headline

    display_url = parse_display_url(str(raw_content))
    data_dict['display_url'] = display_url

    rhref = b.find('a', href=True)
    url = str(rhref['href'])
    data_dict['url'] = ul.unquote(url)

    rtext = b.findAll('div', {'class': 's'})
    text = p.sub('', str(rtext))
    data_dict['text'] = text.replace(']', '').replace('[', '')
    return data_dict
def parse_response(self): soup = BeautifulSoup(self.response) head = soup.find("head") self.max_points = int( _get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0)) if _get_value_from_soup(head, "meta", "value", {"name": "status"}) == "accepted": self.is_accepted = True meta_title = _get_value_from_soup(head, "meta", "content", {"name": "DC.Title"}) if meta_title: self.meta["title"] = meta_title else: title = soup.find("title") if title: self.meta["title"] = title.contents self.meta["description"] = _get_value_from_soup( head, "meta", "content", {"name": "DC.Description"}, "") points = _get_value_from_soup(head, "meta", "value", {"name": "points"}) if points != None: self.points = int(points) self.is_graded = True self.is_accepted = True exercise_div = soup.body.find("div", {"id": "exercise"}) if exercise_div != None: self.content = exercise_div.renderContents() else: self.content = soup.body.renderContents()
def parse_summary(self, summary, ref):
    """Process the article content: strip unwanted tags and rewrite image addresses."""
    soup = BeautifulSoup(summary)

    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()

    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]

    for tag in soup.findAll(self.remove_tags):
        tag.extract()

    img_count = 0
    images = []
    for img in list(soup.findAll('img')):
        if (krconfig.max_image_per_article >= 0 and img_count >= krconfig.max_image_per_article) \
                or img.has_key('src') is False:
            img.extract()
        else:
            try:
                if img['src'].encode('utf-8').lower().endswith(('jpg', 'jpeg', 'gif', 'png', 'bmp')):
                    # Only download URLs ending in an image extension, to avoid fetching
                    # non-image files (such as fake tracking images used for analytics).
                    localimage, fullname = self.parse_image(img['src'])
                    if os.path.isfile(fullname) is False:
                        images.append({'url': img['src'],
                                       'filename': fullname,
                                       'referer': ref})
                    if localimage:
                        img['src'] = localimage
                        img_count = img_count + 1
                    else:
                        img.extract()
                else:
                    img.extract()
            except Exception, e:
                logging.info("error: %s" % e)
                img.extract()
def TPB(book=None, test=False): errmsg = '' provider = "TPB" host = lazylibrarian.CONFIG['TPB_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/s/?") cat = 0 # 601=ebooks, 102=audiobooks, 0=all, no mag category if 'library' in book: if book['library'] == 'AudioBook': cat = 102 elif book['library'] == 'eBook': cat = 601 elif book['library'] == 'magazine': cat = 0 sterm = makeUnicode(book['searchterm']) page = 0 results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 next_page = True while next_page: params = { "q": book['searchterm'], "category": cat, "page": page, "orderby": "99" } searchURL = providerurl + "?%s" % urllib.urlencode(params) next_page = False result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) soup = BeautifulSoup(result) # tpb uses a named table table = soup.find('table', id='searchResult') if table: rows = table.findAll('tr') else: rows = [] if len(rows) > 1: rows = rows[1:] # first row is headers for row in rows: td = row.findAll('td') if len(td) > 2: try: title = unaccented( str(td[1]).split('title=')[1].split('>')[1].split( '<')[0]) magnet = str(td[1]).split('href="')[1].split('"')[0] size = unaccented( td[1].text.split(', Size ')[1].split('iB')[0]) size = size.replace(' ', '') mult = 1 try: if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(td[2].text) except ValueError: seeders = 0 if minimumseeders < int(seeders): # no point in asking for magnet link if not enough seeders magurl = '%s/%s' % (host, magnet) result, success = fetchURL(magurl) if not success: logger.debug('Error fetching url %s, %s' % (magurl, result)) else: magnet = None new_soup = BeautifulSoup(result) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('magnet'): magnet = output break if not magnet or not title: logger.debug('Missing magnet or title') else: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': magnet, 'tor_size': str(size), 'tor_type': 'magnet', 'priority': lazylibrarian.CONFIG['TPB_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) next_page = True else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) page += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page: logger.warn( 'Maximum results page search reached, still more results available' ) next_page = False logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def TPB(book=None): provider = "TPB" host = lazylibrarian.CONFIG['TPB_HOST'] if not str(host)[:4] == "http": host = 'http://' + host providerurl = url_fix(host + "/s/?q=" + book['searchterm']) params = {"category": "601", "page": "0", "orderby": "99"} searchURL = providerurl + "&%s" % urllib.urlencode(params) result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) result = False results = [] if result: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 soup = BeautifulSoup(result) try: table = soup.findAll('table')[0] rows = table.findAll('tr') except Exception: # no results = no table in result page rows = [] c1 = [] c2 = [] if len(rows) > 1: for row in rows[1:]: if len(row.findAll('td')) > 2: c1.append(row.findAll('td')[1]) c2.append(row.findAll('td')[2]) for col1, col2 in zip(c1, c2): try: title = unaccented( str(col1).split('title=')[1].split('>')[1].split('<')[0]) magnet = str(col1).split('href="')[1].split('"')[0] size = unaccented(col1.text.split(', Size ')[1].split('iB')[0]) mult = 1 try: if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(col2.text) except ValueError: seeders = 0 if minimumseeders < seeders: # no point in asking for magnet link if not enough seeders magurl = '%s/%s' % (host, magnet) result, success = fetchURL(magurl) if not success: logger.debug('Error fetching url %s, %s' % (magurl, result)) else: magnet = None new_soup = BeautifulSoup(result) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('magnet'): magnet = output break if not magnet or not title: logger.debug('Missing magnet or title') else: if minimumseeders < seeders: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': magnet, 'tor_size': str(size), 'tor_type': 'magnet' }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results
def GEN(book=None): provider = "libgen" host = lazylibrarian.CONFIG['GEN_HOST'] if not str(host)[:4] == "http": host = 'http://' + host searchURL = url_fix( host + "/search.php?view=simple&open=0&phrase=0&column=def&res=100&req=" + book['searchterm']) result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) elif '111' in result: # looks like libgen has ip based access limits logger.error( 'Access forbidden. Please wait a while before trying %s again.' % provider) else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) result = False results = [] if result: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) soup = BeautifulSoup(result) try: table = soup.findAll('table')[2] rows = table.findAll('tr') except Exception: # no results = no table in result page rows = [] c1 = [] c2 = [] c7 = [] c8 = [] if len(rows) > 1: for row in rows[1:]: if len(row.findAll('td')) > 8: c1.append(row.findAll('td')[1]) c2.append(row.findAll('td')[2]) c7.append(row.findAll('td')[7]) c8.append(row.findAll('td')[8]) for col1, col2, col7, col8 in zip(c1, c2, c7, c8): try: author = unaccented(col1.text) title = unaccented( str(col2).split('>')[2].split('<')[0].strip()) link = str(col2).split('href="')[1].split('?')[1].split('"')[0] size = unaccented(col7.text).upper() extn = col8.text try: mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 if link and title: if author: title = author.strip() + ' ' + title.strip() if extn: title = title + '.' + extn bookURL = url_fix(host + "/ads.php?" + link) bookresult, success = fetchURL(bookURL) if not success: # may return 404 if no results, not really an error if '404' in bookresult: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(bookURL) logger.debug('Error fetching data from %s: %s' % (provider, bookresult)) bookresult = False if bookresult: url = None new_soup = BeautifulSoup(bookresult) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('/get.php'): url = output break if url: url = url_fix(host + url) results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'direct' }) logger.debug('Found %s, Size %s' % (title, size)) except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results
def makelocal(self, feed_data, feed_idx, force_full_text=0):
    '''Generate the parsed result.'''
    global updated_feeds
    global feedlock
    try:
        local = {'idx': feed_idx, 'entries': [], 'title': feed_data.feed['title']}
        item_idx = 1
        for entry in feed_data.entries:
            if item_idx > krconfig.max_items_number:
                break
            try:
                published_datetime = datetime(*entry.published_parsed[0:6])
            except:
                published_datetime = self.parsetime(entry.published)
            if datetime.utcnow() - published_datetime > krconfig.max_old_date:
                break
            try:
                local_author = entry.author
            except:
                local_author = "null"
            local_entry = {
                'idx': item_idx,
                'title': entry.title,
                'published': (published_datetime + krconfig.timezone).strftime("%Y-%m-%d %H:%M:%S"),
                'url': entry.link,
                'author': local_author,
            }
            if force_full_text:
                local_entry['content'], images = self.force_full_text(entry.link)
            else:
                try:
                    local_entry['content'], images = self.parse_summary(entry.content[0].value, entry.link)
                except:
                    local_entry['content'], images = self.parse_summary(entry.summary, entry.link)
            local_entry['stripped'] = ''.join(
                BeautifulSoup(local_entry['content'],
                              convertEntities=BeautifulSoup.HTML_ENTITIES).findAll(text=True))[:200]
            local['entries'].append(local_entry)
            for i in images:
                imgq.put(i)
            item_idx += 1

        if len(local['entries']) > 0:
            if feedlock.acquire():
                updated_feeds.append(local)
                feedlock.release()
            else:
                feedlock.release()
            logging.info("from feed{} update {} items.".format(feed_idx, len(local['entries'])))
        else:
            logging.info("feed{} has no update.".format(feed_idx))
    except Exception, e:
        logging.error("fail(feed{}): {}".format(feed_idx, e))
def TDL(book=None): provider = "torrentdownloads" host = lazylibrarian.TDL_HOST if not str(host)[:4] == "http": host = 'http://' + host providerurl = url_fix(host) params = {"type": "search", "cid": "2", "search": book['searchterm']} searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params) try: request = urllib2.Request(searchURL) if lazylibrarian.PROXY_HOST: request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE) request.add_header('User-Agent', USER_AGENT) data = urllib2.urlopen(request, timeout=90) except (socket.timeout) as e: logger.debug('Timeout fetching data from %s' % provider) data = False except (urllib2.HTTPError, urllib2.URLError, ssl.SSLError) as e: # may return 404 if no results, not really an error if hasattr(e, 'code') and e.code == 404: logger.debug(searchURL) logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(searchURL) if hasattr(e, 'reason'): errmsg = e.reason else: errmsg = str(e) logger.debug('Error fetching data from %s: %s' % (provider, errmsg)) data = False results = [] minimumseeders = int(lazylibrarian.NUMBEROFSEEDERS) - 1 if data: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) d = feedparser.parse(data) if len(d.entries): for item in d.entries: try: title = item['title'] seeders = int(item['seeders']) link = item['link'] size = int(item['size']) url = None if link and minimumseeders < seeders: # no point requesting the magnet link if not enough seeders request = urllib2.Request(link) if lazylibrarian.PROXY_HOST: request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE) request.add_header('User-Agent', USER_AGENT) conn = urllib2.urlopen(request, timeout=90) result = conn.read() url = None new_soup = BeautifulSoup(result) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('magnet'): url = output break if minimumseeders < int(seeders): if not url or not title: logger.debug('Missing url or title') else: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results
def view():
    addon_handle = int(sys.argv[1])
    addon = xbmcaddon.Addon()
    addonname = addon.getAddonInfo('name')
    args = urlparse.parse_qs(sys.argv[2][1:])
    xbmcplugin.setContent(addon_handle, 'movies')
    cat = args.get('cat', None)
    page = args.get('page', None)
    link = args.get('link', None)
    catalogues = [{'label': '\x56\x69\x64\x65\x6F\x20\x4D\xE1\xBB\x9B\x69'.decode('utf-8'), 'id': 'video/new/'},
                  {'label': 'Video Hot', 'id': 'video/hot/'}]

    # Play a link
    if link != None:
        link_video = link[0]
        if link_video.startswith(web_url):
            r = requests.get(link[0])
            html = r.text
            # xbmc.log(html.encode('utf-8'))
            soup = BeautifulSoup(html)
            video_src = soup.find('embed', attrs={'id': 'zplayer'})
            video_flashvars = video_src.get('flashvars')
            args_video = urlparse.parse_qs(video_flashvars)
            link_video = args_video['file'][0]
        xbmc.Player().play(link_video)
        return

    # Load the categories
    if cat == None:
        for cat in catalogues:
            li = xbmcgui.ListItem(cat['label'])
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'cat': cat['id']})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        xbmcplugin.endOfDirectory(addon_handle)
        return

    # Load the contents of a category
    if cat != None:
        if page == None:
            page = 1
        else:
            page = int(page[0])
        r = requests.get(web_url + cat[0] + str(page))
        html = r.text
        xbmc.log(html.encode('utf-8'))
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        data_List = soup.findAll('a', attrs={'class': 'play'})

        # Build a list item for each entry
        for item in data_List:
            link_item = web_url + item.get('href')
            if item.get('data-youtubeid') != '':
                link_item = "plugin://plugin.video.youtube/play/?video_id=" + item.get('data-youtubeid')
            img_item = item.find('img')
            img_src = img_item.get('src')
            img_alt = img_item.get('alt')
            li = xbmcgui.ListItem(img_alt)
            li.setThumbnailImage(img_src)
            li.setInfo(type='image', infoLabels="")
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'link': link_item, 'type': cat[0]})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li)

        # Create the "Next" button
        li = xbmcgui.ListItem("Next")
        urlList = CMDTools.build_url(base_url, {'web': web_name, 'cat': cat[0], 'page': page + 1})
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        # xbmc.executebuiltin("ClearSlideshow")
        # xbmc.executebuiltin("SlideShow(,,notrandom)")
        xbmcplugin.endOfDirectory(addon_handle)
        return

    xbmcplugin.endOfDirectory(addon_handle)
def soup(string, **kwargs):
    """Create a BeautifulSoup parse object from a string"""
    from lib.BeautifulSoup import BeautifulSoup
    return BeautifulSoup(string, **kwargs)
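# A brief usage sketch for the soup() wrapper above; the HTML string is an
# illustrative example, not taken from the original code:
#
#   doc = soup('<p class="intro">hello</p>')
#   doc.find('p', {'class': 'intro'}).string  # -> u'hello'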
def GEN(book=None, prov=None): errmsg = '' provider = "libgen.io" if prov is None: prov = 'GEN' host = lazylibrarian.CONFIG[prov + '_HOST'] if not host.startswith('http'): host = 'http://' + host search = lazylibrarian.CONFIG[prov + '_SEARCH'] if not search or not search.endswith('.php'): search = 'search.php' if 'index.php' not in search and 'search.php' not in search: search = 'search.php' if search[0] == '/': search = search[1:] page = 1 results = [] next_page = True while next_page: if 'index.php' in search: params = { "s": book['searchterm'], "f_lang": "All", "f_columns": 0, "f_ext": "All" } else: params = { "view": "simple", "open": 0, "phrase": 0, "column": "def", "res": 100, "req": book['searchterm'] } if page > 1: params['page'] = page providerurl = url_fix(host + "/%s" % search) searchURL = providerurl + "?%s" % urllib.urlencode(params) next_page = False result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) elif '111' in result: # looks like libgen has ip based access limits logger.error( 'Access forbidden. Please wait a while before trying %s again.' % provider) errmsg = result else: logger.debug(searchURL) logger.debug('Error fetching page data from %s: %s' % (provider, result)) errmsg = result result = False if result: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) try: soup = BeautifulSoup(result) try: table = soup.findAll('table')[2] # un-named table if table: rows = table.findAll('tr') except IndexError: # no results table in result page rows = [] if 'search.php' in search and len(rows) > 1: rows = rows[1:] for row in rows: author = '' title = '' size = '' extn = '' link = '' td = row.findAll('td') if 'index.php' in search and len(td) > 3: try: res = str( BeautifulStoneSoup( td[0].text, convertEntities=BeautifulStoneSoup. HTML_ENTITIES)) author = formatAuthorName(res) title = str( BeautifulStoneSoup( td[2].text, convertEntities=BeautifulStoneSoup. HTML_ENTITIES)) temp = str(td[4]) temp = temp.split('onmouseout')[1] extn = temp.split('">')[1].split('(')[0] size = temp.split('">')[1].split('(')[1].split( ')')[0] size = size.upper() link = temp.split('href=')[1].split('"')[1] except IndexError as e: logger.debug( 'Error parsing libgen index.php results: %s' % str(e)) elif 'search.php' in search and len(td) > 8: try: res = str( BeautifulStoneSoup( td[1].text, convertEntities=BeautifulStoneSoup. HTML_ENTITIES)) author = formatAuthorName(res) title = str( td[2]).split('>')[2].split('<')[0].strip() title = str( BeautifulStoneSoup( title, convertEntities=BeautifulStoneSoup. HTML_ENTITIES)) link = str(td[2]).split('href="')[1].split( '?')[1].split('"')[0] size = unaccented(td[7].text).upper() extn = td[8].text except IndexError as e: logger.debug( 'Error parsing libgen search.php results; %s' % str(e)) if not size: size = 0 else: try: mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 if link and title: if author: title = author.strip() + ' ' + title.strip() if extn: title = title + '.' + extn if not link.startswith('http'): if "/ads.php?" in link: url = url_fix(host + link) else: url = url_fix(host + "/ads.php?" 
+ link) else: url = redirect_url(host, link) bookresult, success = fetchURL(url) if not success: # may return 404 if no results, not really an error if '404' in bookresult: logger.debug( u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(url) logger.debug( 'Error fetching link data from %s: %s' % (provider, bookresult)) errmsg = bookresult bookresult = False if bookresult: url = None try: new_soup = BeautifulSoup(bookresult) for link in new_soup.findAll('a'): output = link.get('href') if output: if output.startswith( 'http' ) and '/get.php' in output: url = output break elif '/get.php' in output: url = '/get.php' + output.split( '/get.php')[1] break elif '/download/book' in output: url = '/download/book' + output.split( '/download/book')[1] break if url and not url.startswith('http'): url = url_fix(host + url) else: url = redirect_url(host, url) except Exception as e: logger.debug( 'Error parsing bookresult for %s: %s' % (link, str(e))) url = None if url: results.append({ 'bookid': book['bookid'], 'tor_prov': provider + '/' + search, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'direct', 'priority': lazylibrarian.CONFIG[prov + '_DLPRIORITY'] }) logger.debug('Found %s, Size %s' % (title, size)) next_page = True except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) page += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page: logger.warn( 'Maximum results page search reached, still more results available' ) next_page = False logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results, errmsg
def KAT(book=None): provider = "KAT" host = lazylibrarian.CONFIG['KAT_HOST'] if not str(host)[:4] == "http": host = 'http://' + host providerurl = url_fix(host + "/usearch/" + book['searchterm']) params = {"category": "books", "field": "seeders", "sorder": "desc"} searchURL = providerurl + "/?%s" % urllib.urlencode(params) result, success = fetchURL(searchURL) if not success: # seems KAT returns 404 if no results, not really an error if '404' in result: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) result = False results = [] if result: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 soup = BeautifulSoup(result) try: table = soup.findAll('table')[1] rows = table.findAll('tr') except Exception: # no results = no table in result page rows = [] c0 = [] c1 = [] c3 = [] if len(rows) > 1: for row in rows[1:]: if len(row.findAll('td')) > 3: c0.append(row.findAll('td')[0]) c1.append(row.findAll('td')[1]) c3.append(row.findAll('td')[3]) for col0, col1, col3 in zip(c0, c1, c3): try: title = unaccented( str(col0).split('cellMainLink">')[1].split('<')[0]) # kat can return magnet or torrent or both. magnet = '' url = '' mode = 'torrent' try: magnet = 'magnet' + str(col0).split( 'href="magnet')[1].split('"')[0] mode = 'magnet' except IndexError: pass try: url = 'http' + str(col0).split('href="http')[1].split( '.torrent?')[0] + '.torrent' mode = 'torrent' except IndexError: pass if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet mode = 'magnet' try: size = str(col1.text).replace(' ', '').upper() mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(col3.text) except ValueError: seeders = 0 if not url or not title: logger.debug('Missing url or title') elif minimumseeders < seeders: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': mode }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results
def determine_min_sdk(): """ Determines the minimum SDK version supported by the vulnerable application\n As a fallback, it allows the user to search Google PlayStore to identify the minimum SDK version if the data is unavailable in manifest.xml """ #determine minimum supported versions common.minSdkVersion = 0 common.sdk = common.xmldoc.getElementsByTagName("uses-sdk") determineSdk = '' if len(common.sdk) > 0: if 'android:minSdkVersion' in common.sdk[0].attributes.keys(): try: common.minSdkVersion = common.sdk[0].attributes[ 'android:minSdkVersion'].value logger.info( common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion)) except Exception as e: common.logger.error( "Something went wrong trying to determine the version from the manifest: " + str(e)) if common.minSdkVersion == 0: if common.source_or_apk == 2: common.minSdkVersion = find_gradle() if common.minSdkVersion == 0: common.logger.info( "We were unable to find the minimum SDK version in your source." ) determineSdk = 'm' else: logger.info( common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion)) else: common.compare(common.sdk.length, 1, common.config.get('qarkhelper', 'USESDK_MISS'), 'false') print common.config.get('qarkhelper', 'GEN_OUTPUT_WARN') while True: determineSdk = raw_input( "Which option would you prefer? (P)lay, (M)anual") if determineSdk.lower() in ('p', 'm'): break else: determineSdk = raw_input("Please enter either (p) or (m):") if determineSdk.lower() == 'p': #get package name from manifest if possible #make call to Play store #determine API version from https://play.google.com/store/apps/details?id=<package name> # will need to adjust the sdk[0] value for the checks below for a in common.xmldoc.getElementsByTagName('manifest'): if 'package' in a.attributes.keys(): print common.config.get('qarkhelper', 'PACK_FOUND') package_name = a.attributes['package'].value print package_name else: package_name = raw_input( common.config.get('qarkhelper', 'NO_PACK_NAME')) try: logger.info( common.config.get('qarkhelper', 'DETERMINING_SDK_VERSION')) play_url = "https://play.google.com/store/apps/details?id=" play_url += package_name print play_url page = urllib2.urlopen(play_url) html = BeautifulSoup(page.read()) play_version = html.find(itemprop="operatingSystems") plat_version = re.findall('\d+.\d+', play_version.contents[0]) if plat_version: plat_version = [str(item) for item in plat_version] api_plat_map = [] api_plat_map.append(['1', '1.0']) api_plat_map.append(['2', '1.1']) api_plat_map.append(['3', '1.5']) api_plat_map.append(['4', '1.6']) api_plat_map.append(['5', '2.0']) api_plat_map.append(['6', '2.0.1']) api_plat_map.append(['7', '2.1']) api_plat_map.append(['8', '2.2']) api_plat_map.append(['9', '2.3']) api_plat_map.append(['10', '2.3.3']) api_plat_map.append(['11', '3.0']) api_plat_map.append(['12', '3.1']) api_plat_map.append(['13', '3.2']) api_plat_map.append(['14', '4.0']) api_plat_map.append(['15', '4.0.3']) api_plat_map.append(['16', '4.1']) api_plat_map.append(['17', '4.2']) api_plat_map.append( ['18', '4.3'] ) #Webviews have critical vuln, no more patches from Google api_plat_map.append(['19', '4.4']) api_plat_map.append( ['20', '4.4'] ) # This is actually 4.4W, a wearable only build, I'm assuming it is the same as 4.4 for our purposes api_plat_map.append(['21', '5.0']) api_plat_map.append( ['22', '5.1'] ) # This is latest version, we'll assume this for newer, until update #TODO - double check this, adding 5.1 may have broken it for a in api_plat_map: if 
StrictVersion(str( plat_version[0])) >= StrictVersion(str(a[1])): common.minSdkVersion = a[0] logger.info( common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion)) manual = raw_input( common.config.get('qarkhelper', 'SDK_VALUE_MANUAL')) else: print common.config.get('qarkhelper', 'CANT_DET_PLAY') #BUG - not processing the cases of wanting to enter if manually, if the retrieval of the play version is broken except HTTPError, e: print str(e) logger.error( common.config.get('qarkhelper', 'MIN_SDK_PLAY_STORE_FAILED')) elif (determineSdk.lower() == 'm' or common.minSdkVersion == 0): #does not actually become 1, just needs a value, since it wasn't found, so we assume worst case print common.term.cyan + common.term.bold + str( common.config.get('qarkhelper', 'NO_MIN_SDK')).decode( 'string-escape').format(t=common.term) enterSdk = raw_input( common.config.get('qarkhelper', 'PROMPT_MIN_SDK')) if enterSdk.lower() == 'y': sdkinput = 0 while True: sdkinput = int( raw_input( common.config.get('qarkhelper', 'PROMPT_VER') + common.config.get('qarkhelper', 'MAX_API_VERSION') + common.config.get('qarkhelper', 'PROMPT_VER2'))) if 0 < int(sdkinput) <= int( common.config.get('qarkhelper', 'MAX_API_VERSION')): common.minSdkVersion = int(sdkinput) break else: common.minSdkVersion = 7
def get_charset_from_html(self, html): return BeautifulSoup(html).originalEncoding
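The one-liner above relies on the BeautifulSoup 3 attribute originalEncoding. If this helper were ever ported to BeautifulSoup 4, the detected encoding is exposed as original_encoding instead; a rough sketch, assuming bs4 is installed:

# Rough BeautifulSoup 4 equivalent of the helper above; the method name simply
# mirrors the original for illustration and is not part of any existing module.
from bs4 import BeautifulSoup

def get_charset_from_html_bs4(html):
    # bs4 records the encoding detected while decoding the markup
    # (None if the input was already unicode).
    return BeautifulSoup(html, "html.parser").original_encoding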
def KAT(book=None, test=False): errmsg = '' provider = "KAT" host = lazylibrarian.CONFIG['KAT_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/usearch/" + urllib.quote(book['searchterm'])) params = {"category": "books", "field": "seeders", "sorder": "desc"} searchURL = providerurl + "/?%s" % urllib.urlencode(params) sterm = makeUnicode(book['searchterm']) result, success = fetchURL(searchURL) if not success: # seems KAT returns 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success results = [] if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 soup = BeautifulSoup(result) rows = [] try: table = soup.findAll('table')[1] # un-named table if table: rows = table.findAll('tr') except IndexError: # no results table in result page rows = [] if len(rows) > 1: rows = rows[1:] # first row is headers for row in rows: td = row.findAll('td') if len(td) > 3: try: title = unaccented( str(td[0]).split('cellMainLink">')[1].split('<')[0]) # kat can return magnet or torrent or both. magnet = '' url = '' mode = 'torrent' try: magnet = 'magnet' + str( td[0]).split('href="magnet')[1].split('"')[0] mode = 'magnet' except IndexError: pass try: url = 'http' + str(td[0]).split('href="http')[1].split( '.torrent?')[0] + '.torrent' mode = 'torrent' except IndexError: pass if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet mode = 'magnet' try: size = str(td[1].text).replace(' ', '').upper() mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(td[3].text) except ValueError: seeders = 0 if not url or not title: logger.debug('Missing url or title') elif minimumseeders < int(seeders): results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': mode, 'priority': lazylibrarian.CONFIG['KAT_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
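The size-string conversion inside the parser above (and repeated in the WWT parser below) can be lifted into a small standalone helper. A sketch, with size_to_bytes as a hypothetical name rather than an existing lazylibrarian function:

# Standalone sketch of the K/M/G size conversion used in the parsers above and
# below; size_to_bytes is a hypothetical name, not part of lazylibrarian.
def size_to_bytes(size):
    size = str(size).replace(' ', '').upper()
    mult = 1
    for suffix, factor in (('K', 1024), ('M', 1024 * 1024), ('G', 1024 * 1024 * 1024)):
        if suffix in size:
            size = size.split(suffix)[0]
            mult = factor
            break
    try:
        return int(float(size) * mult)
    except ValueError:
        return 0

print(size_to_bytes('1.4 MB'))  # -> 1468006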
except:  # tail of a try block from the preceding (omitted) code
    pass

######################## start of main ###################################
for i in range(startId, endId):
    url = "http://lyrics.oiktv.com/singer.php?sid=" + str(i)
    #lyricsWeb = urllib2.urlopen("http://lyrics.oiktv.com/singer.php?sid=51")
    lyricsWeb = urllib2.urlopen(url)
    webContent = lyricsWeb.read()
    lyricsWeb.close()
    soup = BeautifulSoup(webContent)

    # collect this singer's pagination links (href=True skips anchors without an href)
    pages = soup.findAll('a', href=True)
    wantedPages = []
    for page in pages:
        if re.search("&page=", page['href']):
            #print page['href']
            wantedPages.append(page['href'])

    if len(wantedPages) > 1:  # singers with more than 20 albums span several result pages
        maxPageNum = 1  # max 1 page for each singer
        pageNum = 0
        maxSongNum = 250
        songNum = 0
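The manual loop over every anchor above can also be written as a single attribute filter, since findAll accepts a compiled regex as an attribute value. A small sketch of that variant, reusing soup and wantedPages from the script above:

# Sketch: let BeautifulSoup do the href filtering instead of looping manually.
# Assumes soup and wantedPages exist as in the script above.
import re

wantedPages = [page['href'] for page in soup.findAll('a', href=re.compile("&page="))]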
def TDL(book=None, test=False): errmsg = '' provider = "torrentdownloads" host = lazylibrarian.CONFIG['TDL_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host) params = {"type": "search", "cid": "2", "search": book['searchterm']} searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params) sterm = makeUnicode(book['searchterm']) data, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in data: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, data)) errmsg = data data = False if test: return success results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 if data: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) d = feedparser.parse(data) if len(d.entries): for item in d.entries: try: title = item['title'] seeders = int(item['seeders']) link = item['link'] size = int(item['size']) url = None if link and minimumseeders < int(seeders): # no point requesting the magnet link if not enough seeders # TDL gives us a relative link result, success = fetchURL(providerurl + link) if success: new_soup = BeautifulSoup(result) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('magnet'): url = output break if not url or not title: logger.debug('Missing url or title') else: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'magnet', 'priority': lazylibrarian.CONFIG['TDL_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
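The magnet-link extraction nested in the loop above could be factored into a tiny helper. A sketch using the same BeautifulSoup 3 API as the rest of this file; first_magnet_link is a hypothetical name, not part of lazylibrarian:

# Sketch: return the first magnet: link found on a page, or None.
from BeautifulSoup import BeautifulSoup

def first_magnet_link(html):
    soup = BeautifulSoup(html)
    for link in soup.findAll('a', href=True):
        if link['href'].startswith('magnet'):
            return link['href']
    return None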
def WWT(book=None, test=False): errmsg = '' provider = "WorldWideTorrents" host = lazylibrarian.CONFIG['WWT_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/torrents-search.php") sterm = makeUnicode(book['searchterm']) cat = 0 # 0=all, 36=ebooks, 52=mags, 56=audiobooks if 'library' in book: if book['library'] == 'AudioBook': cat = 56 elif book['library'] == 'eBook': cat = 36 elif book['library'] == 'magazine': cat = 52 page = 0 results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 next_page = True while next_page: params = {"search": book['searchterm'], "page": page, "cat": cat} searchURL = providerurl + "/?%s" % urllib.urlencode(params) next_page = False result, success = fetchURL(searchURL) if not success: # might return 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) soup = BeautifulSoup(result) try: tables = soup.findAll('table') # un-named table table = tables[2] if table: rows = table.findAll('tr') except IndexError: # no results table in result page rows = [] if len(rows) > 1: rows = rows[1:] # first row is headers for row in rows: td = row.findAll('td') if len(td) > 3: try: title = unaccented( str(td[0]).split('title="')[1].split('"')[0]) # can return magnet or torrent or both. magnet = '' url = '' mode = 'torrent' try: magnet = 'magnet' + str( td[0]).split('href="magnet')[1].split('"')[0] mode = 'magnet' except IndexError: pass try: url = url_fix(host + '/download.php') + \ str(td[0]).split('href="download.php')[1].split('.torrent"')[0] + '.torrent' mode = 'torrent' except IndexError: pass if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet mode = 'magnet' try: size = str(td[1].text).replace(' ', '').upper() mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(td[2].text) except ValueError: seeders = 0 if not url or not title: logger.debug('Missing url or title') elif minimumseeders < int(seeders): results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': mode, 'priority': lazylibrarian.CONFIG['WWT_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) next_page = True else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) page += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page: logger.warn( 'Maximum results page search reached, still more results available' ) next_page = False logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
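Stripped of the table parsing, the WWT search above is a paging loop that stops once a page contributes nothing new or MAX_PAGES is exceeded. A simplified skeleton of that control flow; fetch_page and parse_rows are hypothetical stand-ins for fetchURL plus the row parsing, and max_pages plays the role of the MAX_PAGES setting:

# Simplified skeleton of the paging loop above; fetch_page and parse_rows are
# hypothetical stand-ins, not lazylibrarian functions.
def paged_search(fetch_page, parse_rows, max_pages):
    results = []
    page = 0
    next_page = True
    while next_page:
        next_page = False
        rows = parse_rows(fetch_page(page))
        if rows:
            results.extend(rows)
            next_page = True   # keep going while pages still return rows
        page += 1
        if 0 < max_pages < page:
            next_page = False  # stop even though more results may be available
    return results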
import os
import re
import sys
import httplib
from BeautifulSoup import BeautifulSoup

prefixurl = "https://android.git.kernel.org/"  # "git://android.git.kernel.org/" tends to time out
currentdir = os.path.abspath(os.path.dirname(sys.argv[0]))  # the dir of the source
repositorydir = ".git"
os.chdir(currentdir)  # change the working directory; cf. os.getcwd()

conn = httplib.HTTPConnection("android.git.kernel.org")
conn.request("GET", "/")
res = conn.getresponse()
if res.status == httplib.OK:
    data = res.read()
    #print data
conn.close()

soup = BeautifulSoup(data)
#print soup.prettify()
table = soup.body.table
#print soup.body.table

# filter: project links have class="list", no title attribute, and an href starting with "/?p"
tags = table.findAll('a', attrs={'class': 'list', 'title': None, 'href': re.compile(r'^/\?p')})
#print tags
projectlist = []
for tag in tags:
    projectlist.append(tag.string)

# writelines won't add the '\n'
f = open(currentdir + "/list.txt", "w")
f.writelines(map(lambda x: x.strip() + "\n", projectlist))
f.close()