def render(self):
    content = cache.get(self.content_url)

    # If the page is not cached, retrieve it
    if content == None:
        opener = urllib2.build_opener()
        content = opener.open(self.content_url, timeout=5).read()
        # Save the page in cache
        cache.set(self.content_url, content)

    soup = BeautifulSoup(content)

    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(self.content_url, tag['href'])

    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))

    return html
def handler(sock, url):
    htmlsource = sock.read()
    soup = BeautifulSoup(htmlsource)
    content = soup.find(id=re.compile("postmessage_\d+"), name="td")
    if content is None:
        return "failed to read content"
    return unicode(content)
def get_organic_data(html_data):
    bs = BeautifulSoup(str(html_data))
    div_filter = bs.find('div', {'id': 'ires'})
    if div_filter:
        contents = div_filter.findAll('li', {'class': 'g'})
        return contents
    return None
def getRes(self):
    url = self.getResURL()
    page = urllib2.urlopen(url).read()  # .decode('GBK').encode('utf-8')
    soup = BeautifulSoup(page)
    main_wrapper = soup.findAll('div', {'class': 'main_wrapper'})[0]
    # print main_wrapper.prettify()
    clr_after = main_wrapper.findAll('div', {'class': 'clr_after'})[0]
    # print clr_after.prettify()
    items = clr_after.findAll('div', {'class': 'main'})[0]
    # print items.prettify()
    items1 = items.findAll('div', {'class': 'lowpriceList'})[0]
    print items1.prettify().decode('utf-8').encode('gbk')
    items2 = items1.findAll('div', {'id': 'hdivResultTable'})[0]
    # print items2.prettify().decode('utf-8').encode('gbk')
    for item in items2:
        print item
        inc = str(item.findAll('td', {'class': 'col3'})[0].contents[0].string)
        fly_time = str(item.findAll('td', {'class': 'col4'})[0].contents[0].string)
        _time = str(item.findAll('td', {'class': 'col2'})[0].contents[0].string)
        _discount = str(item.findAll('span', {'class': 'disc'})[0].contents[0].string)
        _price = str(item.findAll('span', {'class': 'pr'})[0].contents[0].string)
        print inc  # .decode('utf-8').encode('gbk')
        print fly_time  # .decode('utf-8').encode('gbk')
        print _time  # .decode('utf-8').encode('gbk')
        print _discount.decode('utf-8').encode('gbk')
        print _price.decode('utf-8').encode('gbk')
def fetch_trains(place_from, place_to, date):
    key = 'trains_' + place_from + '_' + place_to + '_' + str(date)
    data = memcache.get(key)  # @UndefinedVariable
    if data != None:
        return data
    params = {'fromName': place_from,
              'toName': place_to,
              'when': utils.date_serialize(date),
              'search_type': 'suburban'}
    url = 'http://m.rasp.yandex.ru/search?' + urllib.urlencode(params)
    response = urlfetch.fetch(url)
    html = response.content
    soup = BeautifulSoup(html)
    list_node = soup.find("ul", {"class": "b-holster b-search-result"})
    if list_node != None:
        regex = re.compile(r'<.*?>')
        b_nodes = list_node.findAll("b")
        result = []
        for b_node in b_nodes:
            data = regex.split(b_node.renderContents())
            try:
                time = [datetime.datetime.strptime(x, '%H:%M').time() for x in data]
                result.append(TrainTiming(time[0], time[1]))
            except:
                pass
        memcache.add(key, result, 60 * 60)  # @UndefinedVariable
        return result
def render(self):
    # TODO: fix and enable caching
    # content = cache.get(self.content_url)
    content = None
    url = self.content_url

    # If the page is not cached, retrieve it
    if content == None:
        opener = urllib2.build_opener()
        content = opener.open(url, timeout=5).read()
        # Save the page in cache
        # cache.set(self.content_url, content)

    soup = BeautifulSoup(content)

    # TODO: Disabled. Add GET parameter support and enable.
    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    # for tag in soup.findAll('a', href=True):
    #     tag['href'] = urlparse.urljoin(self.content_url, tag['href'])

    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))

    return html
def get_episodes():
    """docstring for get_episodes"""
    html = retrieve_url("http://www.rtlklub.hu/most/musorok/automania")
    soup = BeautifulSoup(html, fromEncoding="utf-8")
    print soup.originalEncoding
    episodesHtml = soup.findAll("div", attrs={"class": "video-img-cont-catchup cont-first"})
    """ result
    <div class="video-img-cont-catchup cont-first" id="5217">
        <div class="video-date">okt 24.<span>12:15</span></div>
        <a href="http://www.rtlklub.hu/most/5217_automania_09-10-24" class="video-img">
            <img src="http://www.rtlklub.hu/most/files/thumbnails/005/217/2.jpg" width="120" height="90" alt="Autómánia 09-10-24" title="Autómánia 09-10-24" />
        </a>
        <a href="javascript:void(0)" class="video-add" id="5217-0">
            <img src="http://www.rtlklub.hu/most/style/img/add_video_icon.png" alt="Add a kedvenceid közé" title="Add a kedvenceid közé" />
        </a>
        <div class="img-height-wide"></div>
        <h2>
            <a href="http://www.rtlklub.hu/most/5217_automania_09-10-24">Autómánia 09-10-24</a>
        </h2>
        <p>Toyota Prius, Aprilia Tuono 1000R, Honda Accord 2.2 I-DTEC</p>
    </div>
    """
    episodes = []
    # print len(episodesHtml)
    for episode in episodesHtml:
        episodes.append({"title": episode.h2.a.string,
                         "url": episode.h2.a['href'],
                         "thumb": episode.a.img['src']})
    # print episodes
    return episodes
def parse(property_id, ratecode='SPGCP'):
    valid_property = False
    hotel_props = {'id': property_id}
    property_url = "%s?propertyID=%s" % (starwood_url, property_id)
    logging.debug("Property URL: %s" % property_url)

    starwood_response = urlfetch.fetch(url=property_url, deadline=10)
    if starwood_response:
        try:
            soup = BeautifulSoup(starwood_response.content).find(attrs={'id': 'propertyHighlight'}).find(attrs={'class': 'propertyContainer'})
        except:
            soup = None

        if soup:
            try:
                hotel_props['name'] = unicode(soup.find("a", "propertyName").contents[0]).strip()
                hotel_props['category'] = int(str(soup.find("span", "spgCategory").contents[0]).split()[-1])
                valid_property = True
            except:
                pass

            if valid_property:
                hotel_props['address'] = StarwoodParser.parse_address(soup)
                # hotel_props['awards'] = StarwoodParser.parse_starwood(soup.find("div", "tabsContentContainer").findAll("div", "tabContent"))
                hotel_props['image_url'] = str("http://www.starwoodhotels.com%s" % (soup.find("img", "propertyThumbnail")['src']))

    return valid_property and hotel_props or None
def parse_response(self):
    soup = BeautifulSoup(self.response)
    head = soup.find("head")
    self.max_points = int(_get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0))

    if _get_value_from_soup(head, "meta", "value", {"name": "status"}) == "accepted":
        self.is_accepted = True

    meta_title = _get_value_from_soup(head, "meta", "content", {"name": "DC.Title"})
    if meta_title:
        self.meta["title"] = meta_title
    else:
        title = soup.find("title")
        if title:
            self.meta["title"] = title.contents

    self.meta["description"] = _get_value_from_soup(head, "meta", "content", {"name": "DC.Description"}, "")

    points = _get_value_from_soup(head, "meta", "value", {"name": "points"})
    if points != None:
        self.points = int(points)
        self.is_graded = True
        self.is_accepted = True

    exercise_div = soup.body.find("div", {"id": "exercise"})

    if exercise_div != None:
        self.content = exercise_div.renderContents()
    else:
        self.content = soup.body.renderContents()
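# parse_response() above calls a _get_value_from_soup() helper that is not defined
# in this section. The sketch below is a hypothetical minimal version with an
# assumed signature and behaviour, not the original implementation: it returns the
# named attribute of the first matching tag inside `head`, or a default value.
def _get_value_from_soup(head, tag_name, attribute, attrs, default=None):
    element = head.find(tag_name, attrs) if head else None
    if element is None:
        return default
    value = element.get(attribute)
    return value if value is not None else default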
def parse_summary(self, summary, link):
    """Process the article content."""
    soup = BeautifulSoup(summary)

    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()

    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]

    for tag in soup.findAll(self.remove_tags):
        tag.extract()

    img_count = 0
    for img in list(soup.findAll('img')):
        if (self.max_image_number >= 0 and img_count >= self.max_image_number) \
                or img.has_key('src') is False \
                or img['src'].startswith("http://union.vancl.com/") \
                or img['src'].startswith("http://www1.feedsky.com/") \
                or img['src'].startswith("http://feed.feedsky.com/~flare/"):
            img.extract()
        else:
            try:
                localimage = self.down_image(img['src'], link)
                if localimage:
                    img['src'] = localimage
                    img_count = img_count + 1
                else:
                    img.extract()
            except Exception, e:
                print e
                img.extract()
def seturl(self):
    '''Set the URL, fetch it, and save the result.'''
    user = common.currentuser()
    if not user:
        common.error(self, 404, "User not found.")
        return
    ct = models.CustomTest.all().ancestor(user).get()
    if not ct:
        ct = models.CustomTest(parent=user)
    ct.setbypost(self.request.POST)
    if not ct.rss_link:
        soup = Soup(defines.defaulttesthtml)
    else:
        result = urlfetch.fetch(ct.rss_link)
        if result.status_code != 200:
            common.error(self, 200, "Url Fetch Error")
            return
        soup = Soup(result.content)
    try:
        ct.data = soup.prettify().decode('UTF-8')
    except ValueError, message:
        common.error(self, 200, message)
        return
def handler(sock, url):
    htmlsource = sock.read().decode('gb18030', 'replace').encode('utf-8')
    soup = BeautifulSoup(htmlsource)
    content = soup.find("td", {"class": "jiawenzhang-type"})
    if content is None:
        return "content not found"
    return unicode(content)
def view_page(slug):
    page = Page.gql("WHERE slug = :1", slug)[0]
    content = BeautifulSoup(page.content)
    codes = content.findAll('pre')
    for code in codes:
        code.contents[0].replaceWith(controllers.prettify_code(code.contents[0]))
    page.content = str(content)
    return render_template('cms_view_page.html', page=page)
def get_genres(self, url):
    """Return the available genres from the homepage."""
    html = download_page(url)
    ul_tags = BS(html, parseOnlyThese=SS('ul', {'class': 'menu'}))
    dirs = [{'name': a.span.string,
             'url': urljoin(self.base_url, a['href'] + '&limit=0'),
             'mode': '1'}
            for a in ul_tags.findAll('a')]
    self.add_dirs(dirs)
def location(self, ip):
    try:
        self.current_page = self.br.open('http://www.114best.com/ip/114.aspx?w=%s' % ip)
    except Exception:
        return "Earth"
    soup = BeautifulSoup(self.current_page)
    lo = soup.findAll('div', {"id": "output"})[0].findAll('b')[1].text.encode('utf-8', 'ignore')
    return lo
def getPresentation(self):
    base_url = 'http://my.yingjiesheng.com/xuanjianghui_province_'
    for i in range(1, 35):  # fetch the next two days' presentations for the 34 provinces [1-34]
        url = base_url + str(i) + '.html'
        # print url
        try:
            page = self.getRes(url)
            soup = BeautifulSoup(page)
        except:  # failed to open the url
            continue
        # Collect all the countdown blocks
        try:  # the current city may have no presentations in the near future
            countdowns = soup.findAll('div', {'class': 'list_topic'})
            y_m_d2, y_m_d3 = '', ''  # dates of the presentations on day two and day three
            first, second = -1, -1
            # Indexes of the 'campusTalk' tables for day two and day three. Each is offset
            # by one from its countdown div: table 0 is the header row and the real
            # presentation tables start at index 1, so day starts at 1.
            day = 1
            for countdown in countdowns:
                cd = string.atoi(countdown.contents[0].contents[2].string)
                if cd > 2:  # ignore presentations more than two days away
                    break
                elif cd == 1:  # presentations taking place tomorrow (countdown of 1 day)
                    first = day
                    y_m_d2 = countdown.contents[1].string
                elif cd == 2:  # presentations taking place the day after tomorrow (countdown of 2 days)
                    second = day
                    y_m_d3 = countdown.contents[1].string
                day = day + 1
            # first/second index tomorrow's and the day after tomorrow's tables; -1 means no presentations that day
            if first != -1:
                tables = soup.findAll('table', {'class': 'campusTalk'})[first]
                trs = tables.findAll('tr')
                for tr in trs:
                    tds = tr.findAll('td')
                    city = tds[0].a.string.strip()
                    school = tds[1].a.string.strip()
                    addr = tds[2].string.strip()
                    inc = tds[3].a.string.strip()
                    try:  # some presentations do not list a start time [H-M-S]
                        pdate = y_m_d2 + ' ' + tds[4].string
                    except Exception, e:
                        pdate = y_m_d2  # fall back to the date only
                    self.presentations.append(CPresentation(city, inc, school, pdate, addr))
            if second != -1:
                tables = soup.findAll('table', {'class': 'campusTalk'})[second]
                trs = tables.findAll('tr')
                for tr in trs:
                    tds = tr.findAll('td')
                    city = tds[0].a.string.strip()
                    school = tds[1].a.string.strip()
                    addr = tds[2].string.strip()
                    inc = tds[3].a.string.strip()
                    try:
                        pdate = y_m_d3 + ' ' + tds[4].string
                    except:
                        pdate = y_m_d3
                    self.presentations.append(CPresentation(city, inc, school, pdate, addr))
        except:
            pass
def assert_no_error_message_in_response(self, response):
    """Check that response has no error messages."""
    soup = BeautifulSoup(response)
    el = soup.find("p", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
    # Use find() here as well so el is a Tag (a ResultSet has no .contents attribute).
    el = soup.find("label", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
def Items(self):
    itemsprocessed = []
    cnt4debug = 0
    opener = URLOpener(self.host)
    decoder = AutoDecoder()
    for section, url in self.feeds:
        content = None
        cnt4debug += 1
        if IsRunInLocal and cnt4debug > 1:
            break

        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 and content:
            logging.error('err(%d) to fetch %s.' % (status_code, url))
            continue

        if self.feed_encoding:
            content = content.decode(self.feed_encoding)
        else:
            content = decoder.decode(content)

        content = self.preprocess(content)

        feed = feedparser.parse(content)
        for e in feed['entries']:
            # If the full-text RSS carries ads or other unwanted content,
            # it can be stripped out in postprocess.
            desc = self.postprocess(e.description)
            desc = self.FragToXhtml(desc, e.title, self.feed_encoding)

            if self.keep_image:
                soup = BeautifulSoup(content)
                self.soupbeforeimage(soup)
                for img in soup.findAll('img'):
                    imgurl = img['src']
                    if not imgurl.startswith('http') and not imgurl.startswith('www'):
                        imgurl = self.urljoin(url, imgurl)
                    imgresult = opener.open(imgurl)
                    imgcontent = imgresult.content if imgresult.status_code == 200 else None
                    if imgcontent:
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            if imgtype == 'jpeg':
                                fnimg = "%d.jpg" % random.randint(10000, 99999999)
                            else:
                                fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                            img['src'] = fnimg
                            yield (imgmime, imgurl, fnimg, imgcontent)
                self.soupprocessex(soup)
                desc = soup.renderContents('utf-8').decode('utf-8')
                soup = None

            if e.title not in itemsprocessed and desc:
                itemsprocessed.append(e.title)
                yield (section, e.link, e.title, desc)
def view_post(category_slug, post_slug):
    category = Category.gql("WHERE slug = :1", category_slug)[0]
    all_posts = Post.all().order('-date_created')
    post = [x for x in all_posts if x.category.slug == category_slug and x.slug == post_slug][0]
    content = BeautifulSoup(post.content)
    codes = content.findAll('pre')
    for code in codes:
        code.contents[0].replaceWith(controllers.prettify_code(code.contents[0]))
    post.content = unicode(content)
    return render_template('cms_view_post.html', post=post)
def fetchSong(url, viewCount):
    try:
        # Get song info from url
        songInfo = {}
        _get = url.split('?')[1]
        tokens = _get.split('&')
        for token in tokens:
            toks = token.split('=')
            songInfo[toks[0]] = int(toks[1])

        # Fetch the html
        lyricsWeb = urllib2.urlopen(url)
        webContent = lyricsWeb.read()
        lyricsWeb.close()

        soup = BeautifulSoup(webContent)
        lyrics = soup.findAll(id="mylrc")[0].contents
        author = soup.findAll(attrs={'class': 'link_hb'})[0].contents[0]
        album = soup.findAll(attrs={'class': 'link_hb'})[1].contents[0]
        title = soup.findAll(attrs={'class': 'link_hb'})[2].contents[0]

        # print lyrics
        lyricsText = ''
        for line in lyrics:
            for t in line:
                lyricsText += t

        # Construct the xml
        root = ET.Element("xml")
        doc = ET.SubElement(root, "doc")
        sidNode = ET.SubElement(doc, "sid")
        sidNode.text = str(songInfo[u'sid'])
        aidNode = ET.SubElement(doc, "aid")
        aidNode.text = str(songInfo[u'aid'])
        lidNode = ET.SubElement(doc, "lid")
        lidNode.text = str(songInfo[u'lid'])
        titleNode = ET.SubElement(doc, "title")
        titleNode.text = title
        authorNode = ET.SubElement(doc, "author")
        authorNode.text = author
        viewCountNode = ET.SubElement(doc, "viewCount")
        viewCountNode.text = str(viewCount)
        lyricsNode = ET.SubElement(doc, "lyrics")
        lyricsNode.text = lyricsText

        # Construct the tree
        tree = ET.ElementTree(root)
        filename = lyricsDbPath + str(songInfo['lid']) + ".txt"
        tree.write(filename, "utf-8")
    except:
        pass
def get_refresh_url(page_content):
    try:
        page_soup = BeautifulSoup(page_content)
        for meta_tag in page_soup.findAll('meta'):
            if meta_tag['http-equiv'].lower() == 'refresh':
                refresh_url = meta_tag['content'].split('URL=')[1]
                return refresh_url
    except:
        pass
    return None
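# A usage sketch for get_refresh_url() above: follow a small number of
# <meta http-equiv="refresh"> redirects before giving up. The function name,
# the max_hops limit and the use of urllib2 are assumptions for illustration,
# not part of the original snippet.
def fetch_following_refreshes(start_url, max_hops=3):
    url = start_url
    content = urllib2.urlopen(url).read()
    for _ in range(max_hops):
        refresh_url = get_refresh_url(content)
        if not refresh_url:
            break
        url = refresh_url
        content = urllib2.urlopen(url).read()
    return url, content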
def get_shows():
    """docstring for get_shows"""
    html = retrieve_url(BASE_URL)
    soup = BeautifulSoup(html, fromEncoding="utf-8")
    # print soup
    # print "Autómánia"
    showsHtml = soup.find(id="topnav04-ul").findAll("li")
    shows = []
    for show in showsHtml:
        shows.append({"title": show.a.string, "url": show.a['href']})
    return shows
def strip_professors(html, name):
    """Returns list of professor matches"""
    profs = []
    table = BeautifulSoup(html).find('div', {'id': 'ratingTable'})
    if table is None:
        logging.debug(html[500:])
        return profs

    split = name[:-1].upper().split(',')
    qLast = split[0]
    try:
        qFirst = split[1]
    except:
        qFirst = ''

    rows = table.findAll('div', {'class': re.compile(r"entry (odd|even)")})
    for row in rows:
        divName = row.find('div', {'class': 'profName'})
        anchor = divName.find('a')
        profName = unicode(anchor.renderContents().strip(), 'utf-8', 'ignore').upper()
        try:
            firstName = profName.split(',')[1]
        except:
            firstName = ''

        # logging.info('Searching against: ' + profName)
        if profName.startswith(qLast) and qFirst in firstName:
            href = 'http://www.ratemyprofessors.com/' + anchor['href'].strip()
            profDept = row.find('div', {'class': 'profDept'}).renderContents().strip()
            profRatings = row.find('div', {'class': 'profRatings'}).renderContents().strip()
            profQuality = row.find('div', {'class': 'profAvg'}).renderContents().strip()
            profEasiness = row.find('div', {'class': 'profEasy'}).renderContents().strip()
            profHot = row.find('div', {'class': re.compile(r".*\bprofHot\b.*")}).renderContents().strip()

            if profHot == 'Hot':
                profHot = '✓'
            else:
                profHot = ' '

            profs.append({'name': profName,
                          'href': href,
                          'dept': profDept,
                          'ratings': profRatings,
                          'quality': profQuality,
                          'easiness': profEasiness,
                          'hot': profHot})

    return profs
def parse_page(writer, catalogue, page=1):
    print 'Parsing page %s' % page

    url = urllib.urlopen(URL % (catalogue, page))
    soup = BeautifulSoup(url)

    table = soup.find('table', attrs={'class': 'snippets'})
    for tr in table.findAll('tr'):
        # get name of the page
        name = tr.td.h4.a.string
        # get URL of the page
        url = tr.td.h4.a['href'].encode('utf-8')
        # get stats info
        stats = '?'
        stats_element = tr.find('p', attrs={'class': 'Stats'})
        if stats_element:
            stats = stats_element.strong.nextSibling.string[1:-11].replace(' ', '')
            if stats == 'wtrakc':
                stats = '?'
        # get price
        price = tr.find('td', attrs={'class': 'Price'}).strong.string[0:-12]
        # calculate CPM
        cpm = '?'
        try:
            cpm = (float(price) * 30) / int(stats) * 1000
        except:
            cpm = '?'
        # write to the file
        row = [name, url, stats, price.replace('.', ','), str(cpm).replace('.', ',')]
        print row
        writer.writerow(row)

    # find last page of the catalogue
    anchors = soup.findAll('a', href=re.compile('/networks/[0-9]+/websites\?page=[0-9]+'))
    if not anchors:
        return
    pages = []
    for anchor in anchors:
        number = re.match('/networks/[0-9]+/websites\?page=([0-9]+)', anchor['href']).group(1)
        pages.append(int(number))
    pages.sort()
    last = pages[-1]

    # parse next page if exists
    if last > page:
        next = page + 1
        parse_page(writer, catalogue, next)
def assert_warning_message_in_response(self, response, message=""): """Check if response contains one or more warning messages. Assume warning messages rendered as <p class="alert-warning"> elements. """ soup = BeautifulSoup(response) alert = soup.findAll("p", "alert-warning") self.assertGreater(len(alert), 0, "no warning message found in response") if message: found = str(alert[0]).find(message) self.assertGreater(found, 0)
def assert_has_div_with_ID(self, response, id_attr):
    """Check if response contains a Div with a particular ID attribute.

    <div id="<some-id>"> elements.
    """
    soup = BeautifulSoup(response)
    alert = soup.findAll("div", id=id_attr)
    if alert:
        self.assertGreater(len(alert), 0, "No Div tag with, id=%s, in response" % str(id_attr))
    else:
        self.fail("No Div tag with, id=%s, in response" % str(id_attr))
def load(self):
    league_soup = BeautifulSoup(urllib2.urlopen(league_url).read())
    if league_soup:
        self.name = League.name(league_soup)
        self.mb = MessageBoard(self)

        team_rows = league_soup.find('table', attrs={'id': 'standingstable'}).tbody.findAll('tr')
        teams = [Team(self, team_id) for team_id in xrange(1, 2)]  # xrange(1, len(team_rows) + 1)]

        for team in teams:
            print "%s, %s, \"%s\" %s\n" % (team.name, team.record, team.smack, team.roster)
    '''
def getWeatherInfo(self, my_phone):
    for user in self.users:
        # Build the query URL
        url = self.url + self.province_map[user.province.encode('gbk')] + '/' + self.city_map[user.city.encode('gbk')] + '.html'
        # print url
        page = urllib2.urlopen(url).read().decode('GBK').encode('utf-8')
        soup = BeautifulSoup(page)
        # print page.decode('utf-8').encode('gbk')
        city_body = soup.find('div', {'class': 'w365border city_body'})
        weather_info = city_body.findAll('div', {'class': 'weather_div'})
        self.sendSMS(my_phone, weather_info[1], user)  # tomorrow's weather
        self.sendSMS(my_phone, weather_info[2], user)  # the day after tomorrow's weather
def __init__(self, league, team_id):
    team_url = "http://%s%s%s/%d?pak=%s" % (league.sport, YAHOO_FB, league.league_id, team_id, league.access_code)
    team_soup = BeautifulSoup(urllib2.urlopen(team_url).read()).find('div', attrs={'id': 'bd'})
    team_info_soup = team_soup.find('div', attrs={'id': 'teaminfo'})
    self.name = clean(team_info_soup.h1.em.contents[0])
    self.record = Team.parse_record(team_info_soup)
    try:
        self.smack = clean(team_info_soup.find('p', attrs={'id': 'smacktext'}).contents[0])
    except:
        self.smack = ''
    self.roster = Roster(league, team_id).players
def league_settings(league_id, access_code):
    response = urlfetch.fetch("http://football.fantasysports.yahoo.com/f1/%s/settings?pak=%s" % (league_id, access_code))
    settings_table_soup = BeautifulSoup(response.content).find("table", attrs={'id': 'settings-table'})
    positions = defaultdict(int)
    for p in [str(s.strip()) for s in settings_table_soup.findAll('tr')[23].find('td', attrs={'width': '410'}).b.contents[0].split(',')]:
        positions[p] += 1
    # bench_spots = roster_positions.count('BN')
    return positions
def strip_search(html):
    form_html = BeautifulSoup(html).find('form', action='http://websoc.reg.uci.edu/')
    # replace form submit with our own link
    form_html['action'] = '/schedules'
    # remove 'Display Text Results' button
    text_buttons = form_html.findAll(attrs={"class": "banner-width"})
    for i in text_buttons:
        i.replaceWith('<p id="submit-container"><input type="submit" value="Display Results" name="Submit"></p>')
    return str(form_html)
def get_script_urls(self, url, html):
    script_urls = []
    scripts = BeautifulSoup(html, parseOnlyThese=SoupStrainer('script'))
    for tag in scripts:
        if tag.has_key('src'):
            script_urls.append(self.get_absolute_url(url, tag['src']))
    return script_urls
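# get_script_urls() above depends on a get_absolute_url() helper that is not shown
# in this section. A plausible minimal version (hypothetical; the original may
# differ) resolves the src attribute against the page URL with urlparse:
def get_absolute_url(self, base_url, src):
    import urlparse
    return urlparse.urljoin(base_url, src)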
def find_video_links(self, html_message):
    soup = BeautifulSoup(html_message)
    embeds = soup('embed')
    tags = []
    for video in embeds:
        tags.append(db.Text(str(video)))
    return tags
def find_image_links(self, html_message):
    soup = BeautifulSoup(html_message)
    images = soup('img')
    links = []
    for img in images:
        links.append(db.Link(img['src']))
    return links
def hyphenate_html(html, language='en-us', hyphenator=None,
                   blacklist_tags=('code', 'tt', 'pre', 'head', 'title', 'script', 'style',
                                   'meta', 'object', 'embed', 'samp', 'var', 'math', 'select',
                                   'option', 'input', 'textarea')):
    r"""
    Hyphenate a fragment of HTML

    >>> hyphenate_html('<p>It is <em>beautiful</em> outside today!</p>')
    u'<p>It is <em>beau­ti­ful</em> out­side today!</p>'

    >>> hyphenate_html('O paralelepipedo atrevessou a rua', 'pt-br')
    u'O pa­ra­le­le­pi­pe­do atre­ves­sou a rua'

    Content inside <code>, <tt>, and <pre> blocks is not hyphenated

    >>> hyphenate_html('Document: <code>document + page_status</code>')
    u'Doc­u­ment: <code>document + page_status</code>'

    Short words are not hyphenated

    >>> hyphenate_html("<p>The brave men, living and dead.</p>")
    u'<p>The brave men, liv­ing and dead.</p>'
    """
    # Load hyphenator if one is not provided
    if not hyphenator:
        hyphenator = get_hyphenator_for_language(language)

    # Create HTML tree
    soup = BeautifulSoup(html)

    # Recursively hyphenate each element
    hyphenate_element(soup, hyphenator, blacklist_tags)

    return unicode(soup)
def updateprojectlist():
    print "updating the projects list"
    conn = httplib.HTTPConnection("android.git.kernel.org")
    conn.request("GET", "/")
    res = conn.getresponse()
    if res.status == httplib.OK:
        data = res.read()
        # print data
        conn.close()
        soup = BeautifulSoup(data)
        table = soup.body.table
        # print soup.body.table

        # filter
        tags = table.findAll('a', attrs={'class': 'list',
                                         'title': None,
                                         'href': re.compile('^/\?p')})
        # print tags
        projectlist = []
        for tag in tags:
            projectlist.append(tag.string)

        file = open(currentdir + "/" + listfilename, "w")
        # writelines won't add the '\n'
        file.writelines(map(lambda x: x.strip() + "\n", projectlist))
        file.close()
    else:
        print "fail to download the page: ", res.status, res.reason
def clawdata(data):
    data = urllib.urlencode(data)
    url = "http://www.powerball.com/powerball/pb_nbr_history.asp"
    response = urllib2.urlopen(url, data)
    soup = BeautifulSoup(response)
    for tag in soup.findAll(valign="middle"):
        csoup = BeautifulSoup(str(tag))
        dictIssue = dict()
        dictIssue["issueDate"] = ""
        dictIssue["luckNum"] = []
        if csoup.tr != None:
            for tag in csoup.tr.findAll('td'):
                if re.search("[0-9]+\/[0-9]+\/[0-9]{4}", str(tag.text)):
                    dictIssue["issueDate"] = str(tag.text)
                elif str(tag.text) != " ":
                    dictIssue["luckNum"].append(int(tag.text))
            print dictIssue
def getViewCount(songTitle):
    try:
        youtube = 'http://gdata.youtube.com/feeds/api/videos?v=2&max-results=1&q='
        # songTitle = urllib2.quote(songTitle)
        # print songTitle
        url = youtube + songTitle
        # print url
        web = urllib2.urlopen(url)
        content = web.read()
        web.close()
        soup = BeautifulSoup(content)
        stats = soup.findAll('yt:statistics')
        return int(stats[0]['viewcount'])
    except:
        return 0
def parse_organic_contents(raw_content, organic_pos):
    data_dict = {}
    data_dict['position'] = organic_pos

    b = BeautifulSoup(raw_content)
    rtitle = b.find('a')
    headline = p.sub('', str(rtitle))
    data_dict['title'] = headline

    display_url = parse_display_url(str(raw_content))
    data_dict['display_url'] = display_url

    rhref = b.find('a', href=True)
    url = str(rhref['href'])
    data_dict['url'] = ul.unquote(url)

    rtext = b.findAll('div', {'class': 's'})
    text = p.sub('', str(rtext))
    data_dict['text'] = text.replace(']', '').replace('[', '')
    return data_dict
def parse_response(self): soup = BeautifulSoup(self.response) head = soup.find("head") self.max_points = int( _get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0)) if _get_value_from_soup(head, "meta", "value", {"name": "status"}) == "accepted": self.is_accepted = True meta_title = _get_value_from_soup(head, "meta", "content", {"name": "DC.Title"}) if meta_title: self.meta["title"] = meta_title else: title = soup.find("title") if title: self.meta["title"] = title.contents self.meta["description"] = _get_value_from_soup( head, "meta", "content", {"name": "DC.Description"}, "") points = _get_value_from_soup(head, "meta", "value", {"name": "points"}) if points != None: self.points = int(points) self.is_graded = True self.is_accepted = True exercise_div = soup.body.find("div", {"id": "exercise"}) if exercise_div != None: self.content = exercise_div.renderContents() else: self.content = soup.body.renderContents()
def parse_summary(self, summary, ref):
    """Process the article content: strip unwanted tags and rewrite image addresses."""
    soup = BeautifulSoup(summary)

    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()

    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]

    for tag in soup.findAll(self.remove_tags):
        tag.extract()

    img_count = 0
    images = []
    for img in list(soup.findAll('img')):
        if (krconfig.max_image_per_article >= 0 and img_count >= krconfig.max_image_per_article) \
                or img.has_key('src') is False:
            img.extract()
        else:
            try:
                if img['src'].encode('utf-8').lower().endswith(('jpg', 'jpeg', 'gif', 'png', 'bmp')):
                    # Only download URLs ending in an image extension, to avoid fetching
                    # non-image files (such as fake tracking images used for analytics).
                    localimage, fullname = self.parse_image(img['src'])
                    if os.path.isfile(fullname) is False:
                        images.append({'url': img['src'],
                                       'filename': fullname,
                                       'referer': ref})
                    if localimage:
                        img['src'] = localimage
                        img_count = img_count + 1
                    else:
                        img.extract()
                else:
                    img.extract()
            except Exception, e:
                logging.info("error: %s" % e)
                img.extract()
def TPB(book=None, test=False): errmsg = '' provider = "TPB" host = lazylibrarian.CONFIG['TPB_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/s/?") cat = 0 # 601=ebooks, 102=audiobooks, 0=all, no mag category if 'library' in book: if book['library'] == 'AudioBook': cat = 102 elif book['library'] == 'eBook': cat = 601 elif book['library'] == 'magazine': cat = 0 sterm = makeUnicode(book['searchterm']) page = 0 results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 next_page = True while next_page: params = { "q": book['searchterm'], "category": cat, "page": page, "orderby": "99" } searchURL = providerurl + "?%s" % urllib.urlencode(params) next_page = False result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) soup = BeautifulSoup(result) # tpb uses a named table table = soup.find('table', id='searchResult') if table: rows = table.findAll('tr') else: rows = [] if len(rows) > 1: rows = rows[1:] # first row is headers for row in rows: td = row.findAll('td') if len(td) > 2: try: title = unaccented( str(td[1]).split('title=')[1].split('>')[1].split( '<')[0]) magnet = str(td[1]).split('href="')[1].split('"')[0] size = unaccented( td[1].text.split(', Size ')[1].split('iB')[0]) size = size.replace(' ', '') mult = 1 try: if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(td[2].text) except ValueError: seeders = 0 if minimumseeders < int(seeders): # no point in asking for magnet link if not enough seeders magurl = '%s/%s' % (host, magnet) result, success = fetchURL(magurl) if not success: logger.debug('Error fetching url %s, %s' % (magurl, result)) else: magnet = None new_soup = BeautifulSoup(result) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('magnet'): magnet = output break if not magnet or not title: logger.debug('Missing magnet or title') else: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': magnet, 'tor_size': str(size), 'tor_type': 'magnet', 'priority': lazylibrarian.CONFIG['TPB_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) next_page = True else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) page += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page: logger.warn( 'Maximum results page search reached, still more results available' ) next_page = False logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
def TPB(book=None): provider = "TPB" host = lazylibrarian.CONFIG['TPB_HOST'] if not str(host)[:4] == "http": host = 'http://' + host providerurl = url_fix(host + "/s/?q=" + book['searchterm']) params = {"category": "601", "page": "0", "orderby": "99"} searchURL = providerurl + "&%s" % urllib.urlencode(params) result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) result = False results = [] if result: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 soup = BeautifulSoup(result) try: table = soup.findAll('table')[0] rows = table.findAll('tr') except Exception: # no results = no table in result page rows = [] c1 = [] c2 = [] if len(rows) > 1: for row in rows[1:]: if len(row.findAll('td')) > 2: c1.append(row.findAll('td')[1]) c2.append(row.findAll('td')[2]) for col1, col2 in zip(c1, c2): try: title = unaccented( str(col1).split('title=')[1].split('>')[1].split('<')[0]) magnet = str(col1).split('href="')[1].split('"')[0] size = unaccented(col1.text.split(', Size ')[1].split('iB')[0]) mult = 1 try: if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(col2.text) except ValueError: seeders = 0 if minimumseeders < seeders: # no point in asking for magnet link if not enough seeders magurl = '%s/%s' % (host, magnet) result, success = fetchURL(magurl) if not success: logger.debug('Error fetching url %s, %s' % (magurl, result)) else: magnet = None new_soup = BeautifulSoup(result) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('magnet'): magnet = output break if not magnet or not title: logger.debug('Missing magnet or title') else: if minimumseeders < seeders: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': magnet, 'tor_size': str(size), 'tor_type': 'magnet' }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results
def GEN(book=None): provider = "libgen" host = lazylibrarian.CONFIG['GEN_HOST'] if not str(host)[:4] == "http": host = 'http://' + host searchURL = url_fix( host + "/search.php?view=simple&open=0&phrase=0&column=def&res=100&req=" + book['searchterm']) result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) elif '111' in result: # looks like libgen has ip based access limits logger.error( 'Access forbidden. Please wait a while before trying %s again.' % provider) else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) result = False results = [] if result: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) soup = BeautifulSoup(result) try: table = soup.findAll('table')[2] rows = table.findAll('tr') except Exception: # no results = no table in result page rows = [] c1 = [] c2 = [] c7 = [] c8 = [] if len(rows) > 1: for row in rows[1:]: if len(row.findAll('td')) > 8: c1.append(row.findAll('td')[1]) c2.append(row.findAll('td')[2]) c7.append(row.findAll('td')[7]) c8.append(row.findAll('td')[8]) for col1, col2, col7, col8 in zip(c1, c2, c7, c8): try: author = unaccented(col1.text) title = unaccented( str(col2).split('>')[2].split('<')[0].strip()) link = str(col2).split('href="')[1].split('?')[1].split('"')[0] size = unaccented(col7.text).upper() extn = col8.text try: mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 if link and title: if author: title = author.strip() + ' ' + title.strip() if extn: title = title + '.' + extn bookURL = url_fix(host + "/ads.php?" + link) bookresult, success = fetchURL(bookURL) if not success: # may return 404 if no results, not really an error if '404' in bookresult: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(bookURL) logger.debug('Error fetching data from %s: %s' % (provider, bookresult)) bookresult = False if bookresult: url = None new_soup = BeautifulSoup(bookresult) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('/get.php'): url = output break if url: url = url_fix(host + url) results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'direct' }) logger.debug('Found %s, Size %s' % (title, size)) except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results
def makelocal(self, feed_data, feed_idx, force_full_text=0):
    '''Generate the parsed result.'''
    global updated_feeds
    global feedlock
    try:
        local = {'idx': feed_idx, 'entries': [], 'title': feed_data.feed['title']}
        item_idx = 1
        for entry in feed_data.entries:
            if item_idx > krconfig.max_items_number:
                break
            try:
                published_datetime = datetime(*entry.published_parsed[0:6])
            except:
                published_datetime = self.parsetime(entry.published)
            if datetime.utcnow() - published_datetime > krconfig.max_old_date:
                break
            try:
                local_author = entry.author
            except:
                local_author = "null"
            local_entry = {
                'idx': item_idx,
                'title': entry.title,
                'published': (published_datetime + krconfig.timezone).strftime("%Y-%m-%d %H:%M:%S"),
                'url': entry.link,
                'author': local_author,
            }
            if force_full_text:
                local_entry['content'], images = self.force_full_text(entry.link)
            else:
                try:
                    local_entry['content'], images = self.parse_summary(entry.content[0].value, entry.link)
                except:
                    local_entry['content'], images = self.parse_summary(entry.summary, entry.link)
            local_entry['stripped'] = ''.join(
                BeautifulSoup(local_entry['content'],
                              convertEntities=BeautifulSoup.HTML_ENTITIES).findAll(text=True))[:200]
            local['entries'].append(local_entry)
            for i in images:
                imgq.put(i)
            item_idx += 1

        if len(local['entries']) > 0:
            if feedlock.acquire():
                updated_feeds.append(local)
                feedlock.release()
            else:
                feedlock.release()
            logging.info("from feed{} update {} items.".format(feed_idx, len(local['entries'])))
        else:
            logging.info("feed{} has no update.".format(feed_idx))
    except Exception, e:
        logging.error("fail(feed{}): {}".format(feed_idx, e))
def TDL(book=None): provider = "torrentdownloads" host = lazylibrarian.TDL_HOST if not str(host)[:4] == "http": host = 'http://' + host providerurl = url_fix(host) params = {"type": "search", "cid": "2", "search": book['searchterm']} searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params) try: request = urllib2.Request(searchURL) if lazylibrarian.PROXY_HOST: request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE) request.add_header('User-Agent', USER_AGENT) data = urllib2.urlopen(request, timeout=90) except (socket.timeout) as e: logger.debug('Timeout fetching data from %s' % provider) data = False except (urllib2.HTTPError, urllib2.URLError, ssl.SSLError) as e: # may return 404 if no results, not really an error if hasattr(e, 'code') and e.code == 404: logger.debug(searchURL) logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(searchURL) if hasattr(e, 'reason'): errmsg = e.reason else: errmsg = str(e) logger.debug('Error fetching data from %s: %s' % (provider, errmsg)) data = False results = [] minimumseeders = int(lazylibrarian.NUMBEROFSEEDERS) - 1 if data: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) d = feedparser.parse(data) if len(d.entries): for item in d.entries: try: title = item['title'] seeders = int(item['seeders']) link = item['link'] size = int(item['size']) url = None if link and minimumseeders < seeders: # no point requesting the magnet link if not enough seeders request = urllib2.Request(link) if lazylibrarian.PROXY_HOST: request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE) request.add_header('User-Agent', USER_AGENT) conn = urllib2.urlopen(request, timeout=90) result = conn.read() url = None new_soup = BeautifulSoup(result) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('magnet'): url = output break if minimumseeders < int(seeders): if not url or not title: logger.debug('Missing url or title') else: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results
def view():
    addon_handle = int(sys.argv[1])
    addon = xbmcaddon.Addon()
    addonname = addon.getAddonInfo('name')
    args = urlparse.parse_qs(sys.argv[2][1:])
    xbmcplugin.setContent(addon_handle, 'movies')
    cat = args.get('cat', None)
    page = args.get('page', None)
    link = args.get('link', None)
    catalogues = [{'label': '\x56\x69\x64\x65\x6F\x20\x4D\xE1\xBB\x9B\x69'.decode('utf-8'), 'id': 'video/new/'},
                  {'label': 'Video Hot', 'id': 'video/hot/'}]

    # Play a link
    if link != None:
        link_video = link[0]
        if link_video.startswith(web_url):
            r = requests.get(link[0])
            html = r.text
            # xbmc.log(html.encode('utf-8'))
            soup = BeautifulSoup(html)
            video_src = soup.find('embed', attrs={'id': 'zplayer'})
            video_flashvars = video_src.get('flashvars')
            args_video = urlparse.parse_qs(video_flashvars)
            link_video = args_video['file'][0]
        xbmc.Player().play(link_video)
        return

    # Load the categories
    if cat == None:
        for cat in catalogues:
            li = xbmcgui.ListItem(cat['label'])
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'cat': cat['id']})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        xbmcplugin.endOfDirectory(addon_handle)
        return

    # Load the contents of a category
    if cat != None:
        if page == None:
            page = 1
        else:
            page = int(page[0])
        r = requests.get(web_url + cat[0] + str(page))
        html = r.text
        xbmc.log(html.encode('utf-8'))
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        data_List = soup.findAll('a', attrs={'class': 'play'})

        # Build a list item for each entry
        for item in data_List:
            link_item = web_url + item.get('href')
            if item.get('data-youtubeid') != '':
                link_item = "plugin://plugin.video.youtube/play/?video_id=" + item.get('data-youtubeid')
            img_item = item.find('img')
            img_src = img_item.get('src')
            img_alt = img_item.get('alt')
            li = xbmcgui.ListItem(img_alt)
            li.setThumbnailImage(img_src)
            li.setInfo(type='image', infoLabels="")
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'link': link_item, 'type': cat[0]})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li)

        # Create the "Next" button
        li = xbmcgui.ListItem("Next")
        urlList = CMDTools.build_url(base_url, {'web': web_name, 'cat': cat[0], 'page': page + 1})
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        # xbmc.executebuiltin("ClearSlideshow")
        # xbmc.executebuiltin("SlideShow(,,notrandom)")
        xbmcplugin.endOfDirectory(addon_handle)
        return

    xbmcplugin.endOfDirectory(addon_handle)
def soup(string, **kwargs):
    """Create a BeautifulSoup parse object from a string"""
    from lib.BeautifulSoup import BeautifulSoup
    return BeautifulSoup(string, **kwargs)
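# A brief usage sketch for the soup() wrapper above; the HTML string is an
# illustrative example, not taken from the original code:
#
#   doc = soup('<p class="intro">hello</p>')
#   doc.find('p', {'class': 'intro'}).string  # -> u'hello'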
def GEN(book=None, prov=None): errmsg = '' provider = "libgen.io" if prov is None: prov = 'GEN' host = lazylibrarian.CONFIG[prov + '_HOST'] if not host.startswith('http'): host = 'http://' + host search = lazylibrarian.CONFIG[prov + '_SEARCH'] if not search or not search.endswith('.php'): search = 'search.php' if 'index.php' not in search and 'search.php' not in search: search = 'search.php' if search[0] == '/': search = search[1:] page = 1 results = [] next_page = True while next_page: if 'index.php' in search: params = { "s": book['searchterm'], "f_lang": "All", "f_columns": 0, "f_ext": "All" } else: params = { "view": "simple", "open": 0, "phrase": 0, "column": "def", "res": 100, "req": book['searchterm'] } if page > 1: params['page'] = page providerurl = url_fix(host + "/%s" % search) searchURL = providerurl + "?%s" % urllib.urlencode(params) next_page = False result, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in result: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) elif '111' in result: # looks like libgen has ip based access limits logger.error( 'Access forbidden. Please wait a while before trying %s again.' % provider) errmsg = result else: logger.debug(searchURL) logger.debug('Error fetching page data from %s: %s' % (provider, result)) errmsg = result result = False if result: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) try: soup = BeautifulSoup(result) try: table = soup.findAll('table')[2] # un-named table if table: rows = table.findAll('tr') except IndexError: # no results table in result page rows = [] if 'search.php' in search and len(rows) > 1: rows = rows[1:] for row in rows: author = '' title = '' size = '' extn = '' link = '' td = row.findAll('td') if 'index.php' in search and len(td) > 3: try: res = str( BeautifulStoneSoup( td[0].text, convertEntities=BeautifulStoneSoup. HTML_ENTITIES)) author = formatAuthorName(res) title = str( BeautifulStoneSoup( td[2].text, convertEntities=BeautifulStoneSoup. HTML_ENTITIES)) temp = str(td[4]) temp = temp.split('onmouseout')[1] extn = temp.split('">')[1].split('(')[0] size = temp.split('">')[1].split('(')[1].split( ')')[0] size = size.upper() link = temp.split('href=')[1].split('"')[1] except IndexError as e: logger.debug( 'Error parsing libgen index.php results: %s' % str(e)) elif 'search.php' in search and len(td) > 8: try: res = str( BeautifulStoneSoup( td[1].text, convertEntities=BeautifulStoneSoup. HTML_ENTITIES)) author = formatAuthorName(res) title = str( td[2]).split('>')[2].split('<')[0].strip() title = str( BeautifulStoneSoup( title, convertEntities=BeautifulStoneSoup. HTML_ENTITIES)) link = str(td[2]).split('href="')[1].split( '?')[1].split('"')[0] size = unaccented(td[7].text).upper() extn = td[8].text except IndexError as e: logger.debug( 'Error parsing libgen search.php results; %s' % str(e)) if not size: size = 0 else: try: mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 if link and title: if author: title = author.strip() + ' ' + title.strip() if extn: title = title + '.' + extn if not link.startswith('http'): if "/ads.php?" in link: url = url_fix(host + link) else: url = url_fix(host + "/ads.php?" 
+ link) else: url = redirect_url(host, link) bookresult, success = fetchURL(url) if not success: # may return 404 if no results, not really an error if '404' in bookresult: logger.debug( u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(url) logger.debug( 'Error fetching link data from %s: %s' % (provider, bookresult)) errmsg = bookresult bookresult = False if bookresult: url = None try: new_soup = BeautifulSoup(bookresult) for link in new_soup.findAll('a'): output = link.get('href') if output: if output.startswith( 'http' ) and '/get.php' in output: url = output break elif '/get.php' in output: url = '/get.php' + output.split( '/get.php')[1] break elif '/download/book' in output: url = '/download/book' + output.split( '/download/book')[1] break if url and not url.startswith('http'): url = url_fix(host + url) else: url = redirect_url(host, url) except Exception as e: logger.debug( 'Error parsing bookresult for %s: %s' % (link, str(e))) url = None if url: results.append({ 'bookid': book['bookid'], 'tor_prov': provider + '/' + search, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'direct', 'priority': lazylibrarian.CONFIG[prov + '_DLPRIORITY'] }) logger.debug('Found %s, Size %s' % (title, size)) next_page = True except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) page += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page: logger.warn( 'Maximum results page search reached, still more results available' ) next_page = False logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results, errmsg
def KAT(book=None): provider = "KAT" host = lazylibrarian.CONFIG['KAT_HOST'] if not str(host)[:4] == "http": host = 'http://' + host providerurl = url_fix(host + "/usearch/" + book['searchterm']) params = {"category": "books", "field": "seeders", "sorder": "desc"} searchURL = providerurl + "/?%s" % urllib.urlencode(params) result, success = fetchURL(searchURL) if not success: # seems KAT returns 404 if no results, not really an error if '404' in result: logger.debug(u"No results found from %s for %s" % (provider, book['searchterm'])) else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) result = False results = [] if result: logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 soup = BeautifulSoup(result) try: table = soup.findAll('table')[1] rows = table.findAll('tr') except Exception: # no results = no table in result page rows = [] c0 = [] c1 = [] c3 = [] if len(rows) > 1: for row in rows[1:]: if len(row.findAll('td')) > 3: c0.append(row.findAll('td')[0]) c1.append(row.findAll('td')[1]) c3.append(row.findAll('td')[3]) for col0, col1, col3 in zip(c0, c1, c3): try: title = unaccented( str(col0).split('cellMainLink">')[1].split('<')[0]) # kat can return magnet or torrent or both. magnet = '' url = '' mode = 'torrent' try: magnet = 'magnet' + str(col0).split( 'href="magnet')[1].split('"')[0] mode = 'magnet' except IndexError: pass try: url = 'http' + str(col0).split('href="http')[1].split( '.torrent?')[0] + '.torrent' mode = 'torrent' except IndexError: pass if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet mode = 'magnet' try: size = str(col1.text).replace(' ', '').upper() mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(col3.text) except ValueError: seeders = 0 if not url or not title: logger.debug('Missing url or title') elif minimumseeders < seeders: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': mode }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug( u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm'])) return results
def determine_min_sdk(): """ Determines the minimum SDK version supported by the vulnerable application\n As a fallback, it allows the user to search Google PlayStore to identify the minimum SDK version if the data is unavailable in manifest.xml """ #determine minimum supported versions common.minSdkVersion = 0 common.sdk = common.xmldoc.getElementsByTagName("uses-sdk") determineSdk = '' if len(common.sdk) > 0: if 'android:minSdkVersion' in common.sdk[0].attributes.keys(): try: common.minSdkVersion = common.sdk[0].attributes[ 'android:minSdkVersion'].value logger.info( common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion)) except Exception as e: common.logger.error( "Something went wrong trying to determine the version from the manifest: " + str(e)) if common.minSdkVersion == 0: if common.source_or_apk == 2: common.minSdkVersion = find_gradle() if common.minSdkVersion == 0: common.logger.info( "We were unable to find the minimum SDK version in your source." ) determineSdk = 'm' else: logger.info( common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion)) else: common.compare(common.sdk.length, 1, common.config.get('qarkhelper', 'USESDK_MISS'), 'false') print common.config.get('qarkhelper', 'GEN_OUTPUT_WARN') while True: determineSdk = raw_input( "Which option would you prefer? (P)lay, (M)anual") if determineSdk.lower() in ('p', 'm'): break else: determineSdk = raw_input("Please enter either (p) or (m):") if determineSdk.lower() == 'p': #get package name from manifest if possible #make call to Play store #determine API version from https://play.google.com/store/apps/details?id=<package name> # will need to adjust the sdk[0] value for the checks below for a in common.xmldoc.getElementsByTagName('manifest'): if 'package' in a.attributes.keys(): print common.config.get('qarkhelper', 'PACK_FOUND') package_name = a.attributes['package'].value print package_name else: package_name = raw_input( common.config.get('qarkhelper', 'NO_PACK_NAME')) try: logger.info( common.config.get('qarkhelper', 'DETERMINING_SDK_VERSION')) play_url = "https://play.google.com/store/apps/details?id=" play_url += package_name print play_url page = urllib2.urlopen(play_url) html = BeautifulSoup(page.read()) play_version = html.find(itemprop="operatingSystems") plat_version = re.findall('\d+.\d+', play_version.contents[0]) if plat_version: plat_version = [str(item) for item in plat_version] api_plat_map = [] api_plat_map.append(['1', '1.0']) api_plat_map.append(['2', '1.1']) api_plat_map.append(['3', '1.5']) api_plat_map.append(['4', '1.6']) api_plat_map.append(['5', '2.0']) api_plat_map.append(['6', '2.0.1']) api_plat_map.append(['7', '2.1']) api_plat_map.append(['8', '2.2']) api_plat_map.append(['9', '2.3']) api_plat_map.append(['10', '2.3.3']) api_plat_map.append(['11', '3.0']) api_plat_map.append(['12', '3.1']) api_plat_map.append(['13', '3.2']) api_plat_map.append(['14', '4.0']) api_plat_map.append(['15', '4.0.3']) api_plat_map.append(['16', '4.1']) api_plat_map.append(['17', '4.2']) api_plat_map.append( ['18', '4.3'] ) #Webviews have critical vuln, no more patches from Google api_plat_map.append(['19', '4.4']) api_plat_map.append( ['20', '4.4'] ) # This is actually 4.4W, a wearable only build, I'm assuming it is the same as 4.4 for our purposes api_plat_map.append(['21', '5.0']) api_plat_map.append( ['22', '5.1'] ) # This is latest version, we'll assume this for newer, until update #TODO - double check this, adding 5.1 may have broken it for a in api_plat_map: if 
StrictVersion(str( plat_version[0])) >= StrictVersion(str(a[1])): common.minSdkVersion = a[0] logger.info( common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion)) manual = raw_input( common.config.get('qarkhelper', 'SDK_VALUE_MANUAL')) else: print common.config.get('qarkhelper', 'CANT_DET_PLAY') #BUG - not processing the cases of wanting to enter if manually, if the retrieval of the play version is broken except HTTPError, e: print str(e) logger.error( common.config.get('qarkhelper', 'MIN_SDK_PLAY_STORE_FAILED')) elif (determineSdk.lower() == 'm' or common.minSdkVersion == 0): #does not actually become 1, just needs a value, since it wasn't found, so we assume worst case print common.term.cyan + common.term.bold + str( common.config.get('qarkhelper', 'NO_MIN_SDK')).decode( 'string-escape').format(t=common.term) enterSdk = raw_input( common.config.get('qarkhelper', 'PROMPT_MIN_SDK')) if enterSdk.lower() == 'y': sdkinput = 0 while True: sdkinput = int( raw_input( common.config.get('qarkhelper', 'PROMPT_VER') + common.config.get('qarkhelper', 'MAX_API_VERSION') + common.config.get('qarkhelper', 'PROMPT_VER2'))) if 0 < int(sdkinput) <= int( common.config.get('qarkhelper', 'MAX_API_VERSION')): common.minSdkVersion = int(sdkinput) break else: common.minSdkVersion = 7
def get_charset_from_html(self, html): return BeautifulSoup(html).originalEncoding
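The one-liner above relies on the BeautifulSoup 3 attribute originalEncoding. If this helper were ever ported to BeautifulSoup 4, the detected encoding is exposed as original_encoding instead; a rough sketch, assuming bs4 is installed:

# Rough BeautifulSoup 4 equivalent of the helper above; the method name simply
# mirrors the original for illustration and is not part of any existing module.
from bs4 import BeautifulSoup

def get_charset_from_html_bs4(html):
    # bs4 records the encoding detected while decoding the markup
    # (None if the input was already unicode).
    return BeautifulSoup(html, "html.parser").original_encoding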
def KAT(book=None, test=False): errmsg = '' provider = "KAT" host = lazylibrarian.CONFIG['KAT_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/usearch/" + urllib.quote(book['searchterm'])) params = {"category": "books", "field": "seeders", "sorder": "desc"} searchURL = providerurl + "/?%s" % urllib.urlencode(params) sterm = makeUnicode(book['searchterm']) result, success = fetchURL(searchURL) if not success: # seems KAT returns 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success results = [] if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 soup = BeautifulSoup(result) rows = [] try: table = soup.findAll('table')[1] # un-named table if table: rows = table.findAll('tr') except IndexError: # no results table in result page rows = [] if len(rows) > 1: rows = rows[1:] # first row is headers for row in rows: td = row.findAll('td') if len(td) > 3: try: title = unaccented( str(td[0]).split('cellMainLink">')[1].split('<')[0]) # kat can return magnet or torrent or both. magnet = '' url = '' mode = 'torrent' try: magnet = 'magnet' + str( td[0]).split('href="magnet')[1].split('"')[0] mode = 'magnet' except IndexError: pass try: url = 'http' + str(td[0]).split('href="http')[1].split( '.torrent?')[0] + '.torrent' mode = 'torrent' except IndexError: pass if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet mode = 'magnet' try: size = str(td[1].text).replace(' ', '').upper() mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(td[3].text) except ValueError: seeders = 0 if not url or not title: logger.debug('Missing url or title') elif minimumseeders < int(seeders): results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': mode, 'priority': lazylibrarian.CONFIG['KAT_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
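The size-string conversion inside the parser above (and repeated in the WWT parser below) can be lifted into a small standalone helper. A sketch, with size_to_bytes as a hypothetical name rather than an existing lazylibrarian function:

# Standalone sketch of the K/M/G size conversion used in the parsers above and
# below; size_to_bytes is a hypothetical name, not part of lazylibrarian.
def size_to_bytes(size):
    size = str(size).replace(' ', '').upper()
    mult = 1
    for suffix, factor in (('K', 1024), ('M', 1024 * 1024), ('G', 1024 * 1024 * 1024)):
        if suffix in size:
            size = size.split(suffix)[0]
            mult = factor
            break
    try:
        return int(float(size) * mult)
    except ValueError:
        return 0

print(size_to_bytes('1.4 MB'))  # -> 1468006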
except:  # tail of a try block from the preceding (omitted) code
    pass

######################## start of main ###################################
for i in range(startId, endId):
    url = "http://lyrics.oiktv.com/singer.php?sid=" + str(i)
    #lyricsWeb = urllib2.urlopen("http://lyrics.oiktv.com/singer.php?sid=51")
    lyricsWeb = urllib2.urlopen(url)
    webContent = lyricsWeb.read()
    lyricsWeb.close()
    soup = BeautifulSoup(webContent)

    # collect this singer's pagination links (href=True skips anchors without an href)
    pages = soup.findAll('a', href=True)
    wantedPages = []
    for page in pages:
        if re.search("&page=", page['href']):
            #print page['href']
            wantedPages.append(page['href'])

    if len(wantedPages) > 1:  # singers with more than 20 albums span several result pages
        maxPageNum = 1  # max 1 page for each singer
        pageNum = 0
        maxSongNum = 250
        songNum = 0
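The manual loop over every anchor above can also be written as a single attribute filter, since findAll accepts a compiled regex as an attribute value. A small sketch of that variant, reusing soup and wantedPages from the script above:

# Sketch: let BeautifulSoup do the href filtering instead of looping manually.
# Assumes soup and wantedPages exist as in the script above.
import re

wantedPages = [page['href'] for page in soup.findAll('a', href=re.compile("&page="))]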
def TDL(book=None, test=False): errmsg = '' provider = "torrentdownloads" host = lazylibrarian.CONFIG['TDL_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host) params = {"type": "search", "cid": "2", "search": book['searchterm']} searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params) sterm = makeUnicode(book['searchterm']) data, success = fetchURL(searchURL) if not success: # may return 404 if no results, not really an error if '404' in data: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, data)) errmsg = data data = False if test: return success results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 if data: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) d = feedparser.parse(data) if len(d.entries): for item in d.entries: try: title = item['title'] seeders = int(item['seeders']) link = item['link'] size = int(item['size']) url = None if link and minimumseeders < int(seeders): # no point requesting the magnet link if not enough seeders # TDL gives us a relative link result, success = fetchURL(providerurl + link) if success: new_soup = BeautifulSoup(result) for link in new_soup.findAll('a'): output = link.get('href') if output and output.startswith('magnet'): url = output break if not url or not title: logger.debug('Missing url or title') else: results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': 'magnet', 'priority': lazylibrarian.CONFIG['TDL_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
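The magnet-link extraction nested in the loop above could be factored into a tiny helper. A sketch using the same BeautifulSoup 3 API as the rest of this file; first_magnet_link is a hypothetical name, not part of lazylibrarian:

# Sketch: return the first magnet: link found on a page, or None.
from BeautifulSoup import BeautifulSoup

def first_magnet_link(html):
    soup = BeautifulSoup(html)
    for link in soup.findAll('a', href=True):
        if link['href'].startswith('magnet'):
            return link['href']
    return None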
def WWT(book=None, test=False): errmsg = '' provider = "WorldWideTorrents" host = lazylibrarian.CONFIG['WWT_HOST'] if not host.startswith('http'): host = 'http://' + host providerurl = url_fix(host + "/torrents-search.php") sterm = makeUnicode(book['searchterm']) cat = 0 # 0=all, 36=ebooks, 52=mags, 56=audiobooks if 'library' in book: if book['library'] == 'AudioBook': cat = 56 elif book['library'] == 'eBook': cat = 36 elif book['library'] == 'magazine': cat = 52 page = 0 results = [] minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1 next_page = True while next_page: params = {"search": book['searchterm'], "page": page, "cat": cat} searchURL = providerurl + "/?%s" % urllib.urlencode(params) next_page = False result, success = fetchURL(searchURL) if not success: # might return 404 if no results, not really an error if '404' in result: logger.debug("No results found from %s for %s" % (provider, sterm)) success = True else: logger.debug(searchURL) logger.debug('Error fetching data from %s: %s' % (provider, result)) errmsg = result result = False if test: return success if result: logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider)) soup = BeautifulSoup(result) try: tables = soup.findAll('table') # un-named table table = tables[2] if table: rows = table.findAll('tr') except IndexError: # no results table in result page rows = [] if len(rows) > 1: rows = rows[1:] # first row is headers for row in rows: td = row.findAll('td') if len(td) > 3: try: title = unaccented( str(td[0]).split('title="')[1].split('"')[0]) # can return magnet or torrent or both. magnet = '' url = '' mode = 'torrent' try: magnet = 'magnet' + str( td[0]).split('href="magnet')[1].split('"')[0] mode = 'magnet' except IndexError: pass try: url = url_fix(host + '/download.php') + \ str(td[0]).split('href="download.php')[1].split('.torrent"')[0] + '.torrent' mode = 'torrent' except IndexError: pass if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']): url = magnet mode = 'magnet' try: size = str(td[1].text).replace(' ', '').upper() mult = 1 if 'K' in size: size = size.split('K')[0] mult = 1024 elif 'M' in size: size = size.split('M')[0] mult = 1024 * 1024 elif 'G' in size: size = size.split('G')[0] mult = 1024 * 1024 * 1024 size = int(float(size) * mult) except (ValueError, IndexError): size = 0 try: seeders = int(td[2].text) except ValueError: seeders = 0 if not url or not title: logger.debug('Missing url or title') elif minimumseeders < int(seeders): results.append({ 'bookid': book['bookid'], 'tor_prov': provider, 'tor_title': title, 'tor_url': url, 'tor_size': str(size), 'tor_type': mode, 'priority': lazylibrarian.CONFIG['WWT_DLPRIORITY'] }) logger.debug('Found %s. Size: %s' % (title, size)) next_page = True else: logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders))) except Exception as e: logger.error("An error occurred in the %s parser: %s" % (provider, str(e))) logger.debug('%s: %s' % (provider, traceback.format_exc())) page += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page: logger.warn( 'Maximum results page search reached, still more results available' ) next_page = False logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm)) return results, errmsg
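Stripped of the table parsing, the WWT search above is a paging loop that stops once a page contributes nothing new or MAX_PAGES is exceeded. A simplified skeleton of that control flow; fetch_page and parse_rows are hypothetical stand-ins for fetchURL plus the row parsing, and max_pages plays the role of the MAX_PAGES setting:

# Simplified skeleton of the paging loop above; fetch_page and parse_rows are
# hypothetical stand-ins, not lazylibrarian functions.
def paged_search(fetch_page, parse_rows, max_pages):
    results = []
    page = 0
    next_page = True
    while next_page:
        next_page = False
        rows = parse_rows(fetch_page(page))
        if rows:
            results.extend(rows)
            next_page = True   # keep going while pages still return rows
        page += 1
        if 0 < max_pages < page:
            next_page = False  # stop even though more results may be available
    return results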
import os
import re
import sys
import httplib
from BeautifulSoup import BeautifulSoup

prefixurl = "https://android.git.kernel.org/"  # "git://android.git.kernel.org/" tends to time out
currentdir = os.path.abspath(os.path.dirname(sys.argv[0]))  # the dir of the source
repositorydir = ".git"
os.chdir(currentdir)  # change the working directory; cf. os.getcwd()

conn = httplib.HTTPConnection("android.git.kernel.org")
conn.request("GET", "/")
res = conn.getresponse()
if res.status == httplib.OK:
    data = res.read()
    #print data
conn.close()

soup = BeautifulSoup(data)
#print soup.prettify()
table = soup.body.table
#print soup.body.table

# filter: project links have class="list", no title attribute, and an href starting with "/?p"
tags = table.findAll('a', attrs={'class': 'list', 'title': None, 'href': re.compile(r'^/\?p')})
#print tags
projectlist = []
for tag in tags:
    projectlist.append(tag.string)

# writelines won't add the '\n'
f = open(currentdir + "/list.txt", "w")
f.writelines(map(lambda x: x.strip() + "\n", projectlist))
f.close()