Example #1
0
def DoSoupFindAll(data, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs):
    # Thin wrapper around BeautifulSoup's findAll(): parse `data` and return the
    # matching tags, or an empty list if parsing/searching fails.
    try:
        soup = beautifulsoup.BeautifulSoup(data)
        return soup.findAll(name, attrs, recursive, text, limit, **kwargs)
    except Exception:
        logFile.debug("Error parsing using soup", exc_info=True)
        return []
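A minimal usage sketch for the wrapper above. It assumes BeautifulSoup 3 is importable as `beautifulsoup` and that `logFile` is an already-configured logger; both names are taken from the snippet, not verified here.
import logging
import BeautifulSoup as beautifulsoup  # assumed BS3 import, matching the snippet's `beautifulsoup` name

logFile = logging.getLogger(__name__)

html = '<ul><li>first</li><li>second</li></ul>'
for li in DoSoupFindAll(html, 'li'):
    # each hit is a BS3 Tag; findAll(text=True) collects its text nodes
    print ''.join(li.findAll(text=True))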
Example #2
0
    def getdetailednzbinfo(self, data):

        filesegs = []
        fileinfo = {}
        fileinfo['pars'] = 0
        fileinfo['rars'] = 0
        fileinfo['nfo'] = 0
        fileinfo['nofile'] = 0
        fileinfo['nbytes'] = 0
        fileinfo['postid'] = []

        if (len(data) == 0):
            return fileinfo

        soup = beautifulsoup.BeautifulSoup(data)
        fileno = soup.findAll('file')

        for fno in fileno:
            try:
                segs = fno.findAll('segments')
                fsggs = 0
                parfile = 0
                #~ no rar / rar-parts check here: many NZBs contain just uncompressed files
                val_sample = re.search(r"[\.\-]sample", fno['subject'], re.I)
                if (val_sample is not None):
                    continue
                if (fno['subject'].find('.nfo') != -1):
                    fileinfo['nfo'] = fileinfo['nfo'] + 1
                elif (fno['subject'].find('.par2') != -1):
                    fileinfo['pars'] = fileinfo['pars'] + 1
                    parfile = 1
                else:
                    fileinfo['nofile'] = fileinfo['nofile'] + 1

                for s in segs:
                    s_segs = s.findAll('segment')
                    fsggs = fsggs + len(s_segs)
                    postid = []
                    for s2 in s_segs:
                        fileinfo['nbytes'] += int(s2['bytes'])
            except Exception as e:
                fileinfo['pars'] = 0
                fileinfo['rars'] = 0
                fileinfo['nfo'] = 0
                fileinfo['nofile'] = 0
                fileinfo['nbytes'] = 0
                fileinfo['postid'] = []

                log.critical("Error, could not parse NZB file", exc_info=True)
                #~ sys.exit()

        fileinfo['nbytes'] = int(fileinfo['nbytes'] / (1024 * 1024))

        #~ print 'Num files: ' + str(fileinfo['nofile']) + ' of which repair files ' + str(fileinfo['pars'])
        return fileinfo
    def get_profile_info(self):
        socket.setdefaulttimeout(self.timeout)
        if (self.chkcookie() == False):
            if (self.dologin() == False):
                return []

        loginurl = self.cur_cfg['url'] + "/profile"
        try:
            socket.setdefaulttimeout(self.timeout)
            res = self.br.open(loginurl)
        except Exception as e:
            eret = self.mech_error_generic(e)
            if (eret == 302):
                self.reset_cookies()
            return []

        data = res.get_data()
        soup = beautifulsoup.BeautifulSoup(data)

        info = {}
        for row in soup.findAll("tr"):
            data = {}
            #~ print row
            #~ print '--------'
            allTHs = row.findAll("th")
            for x in range(len(allTHs)):
                str_lowcase = str(allTHs[x]).lower()
                if (str_lowcase.find('api hits today') > -1):
                    allTD = row.findAll("td")
                    if (len(allTD)):
                        info['api_hits'] = ''.join(allTD[0].findAll(text=True))

                if (str_lowcase.find('grabs today') > -1):
                    allTD = row.findAll("td")
                    if (len(allTD)):
                        info['grabs_today'] = ''.join(
                            allTD[0].findAll(text=True))
                # note: the bare 'grabs' fallback below also matches the 'grabs today'
                # row, so grabs_total takes the value of whichever matching row is seen last
                if (str_lowcase.find('grabs total') > -1
                        or str_lowcase.find('grabs') > -1):
                    allTD = row.findAll("td")
                    if (len(allTD)):
                        info['grabs_total'] = ''.join(
                            allTD[0].findAll(text=True))

        #~ print info
        return info
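The th/td matching inside get_profile_info can be hard to follow in the loop; below is a stripped-down sketch of the same pattern on a tiny hand-written table. The HTML is invented for illustration, and BeautifulSoup 3 is again assumed to be importable as `beautifulsoup`.
import BeautifulSoup as beautifulsoup  # assumed BS3 import, as in the snippets

# made-up profile table, for illustration only
html = ('<table>'
        '<tr><th>API hits today</th><td>123</td></tr>'
        '<tr><th>Grabs today</th><td>4</td></tr>'
        '</table>')

info = {}
soup = beautifulsoup.BeautifulSoup(html)
for row in soup.findAll('tr'):
    ths = row.findAll('th')
    tds = row.findAll('td')
    if not ths or not tds:
        continue
    label = str(ths[0]).lower()                 # markup included, as in the snippet
    value = ''.join(tds[0].findAll(text=True))  # joined text of the first <td>
    if label.find('api hits today') > -1:
        info['api_hits'] = value
    elif label.find('grabs today') > -1:
        info['grabs_today'] = value

# info -> {'api_hits': '123', 'grabs_today': '4'}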
Example #4
0
def get_links(value):
    """
    Returns links found in an (X)HTML string as Python objects for iteration in templates.
    
    EXAMPLE:
    
    <ul>
      {% for link in blog.entry.body|get_links %}
         <li><a href="{{ link.href }}">{{ link.title }}</a></li>
      {% endfor %}
    </ul>
    
    """
    try:
        import beautifulsoup
    except ImportError:
        if settings.DEBUG:
            raise template.TemplateSyntaxError, "Error in {% get_links %} filter: the BeautifulSoup library isn't installed."
        return value
    else:
        soup = beautifulsoup.BeautifulSoup(value)
        return soup.findAll('a')
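The docstring shows get_links being applied as a Django template filter. The registration itself is not part of the snippet; a typical registration, assumed here, would live in a templatetags module and look roughly like this.
# hypothetical registration for the filter above (not shown in the snippet)
from django import template

register = template.Library()
register.filter('get_links', get_links)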
    def search(self, srchstr):
        if (self.cur_cfg['valid'] == 0):
            return []

        socket.setdefaulttimeout(self.timeout)

        self.cur_cfg['retcode'] = self.default_retcode
        if (self.chkcookie() == False):
            if (self.dologin() == False):
                return []

        mainurl = self.cur_cfg['url']

        #~ the category token needs a leading asterisk, otherwise it messes up the search
        srchstrnu = srchstr.split('.')

        for defcats in self.definedcat:
            if (defcats[0] == srchstrnu[-1]):
                srchstrnu[-1] = '*' + srchstrnu[-1]
                srchstr = ".".join(srchstrnu)

        loginurl = mainurl + '/nzbbrowse.php?b=2&st=1&c=0&g=0&sr=2&o=0&k=' + srchstr
        timestamp_s = time.time()

        try:
            socket.setdefaulttimeout(self.timeout)
            res = self.br.open(loginurl)
        except Exception as e:
            eret = self.mech_error_generic(e)
            if (eret == 302):
                self.reset_cookies()
            return []

        data = res.get_data()
        timestamp_e = time.time()
        log.info('TS ' + mainurl + " " + str(timestamp_e - timestamp_s))
        self.cur_cfg['retcode'][2] = timestamp_e - timestamp_s

        #~ def searchDBG(self, srchstr):
        #~ handler = open('test.html').read()
        soup = beautifulsoup.BeautifulSoup(data)

        parsed_data = []
        titlesdiv = soup.findAll('div', {'class': 'pstnam'})
        nzburlsdiv = soup.findAll('div', {'class': 'dlnzb'})
        tstampdiv = soup.findAll('div', {'class': 'pstdat'})
        szdiv = soup.findAll('abbr', {'title': 'Total size of articles'})
        catdiv = soup.findAll('a', {'class': 'catimg'})

        titles = []
        rdetails = []
        nzburls = []
        tstamp = []
        bytesize = []
        categr = []

        for tl in catdiv:
            fall_tt = tl['title'].find('Show all in: ')
            if (fall_tt != -1):
                categr.append(tl['title'][fall_tt + 13:])
            else:
                categr = []
                break

        for tl in titlesdiv:
            all_a = tl.findAll("a")
            titles.append(''.join(all_a[0].findAll(text=True)))
            rdetails.append(all_a[0]['href'][1:])

        for tl in nzburlsdiv:
            all_a = tl.findAll("a")
            nzburls.append(all_a[0]['href'][1:])

        #~ absolute day of posting
        for tl in tstampdiv:
            intage = int(tl.findAll(text=True)[0].split()[0].split('.')[0])
            today = datetime.datetime.now()
            dd = datetime.timedelta(days=intage)
            earlier = today - dd
            tstamp.append(time.mktime(earlier.timetuple()))

        for sz1 in szdiv:
            for sz2 in sz1.findAll(text=True):
                sz2s = sz2.split()
                if (len(sz2s) == 2):
                    if (sz2s[1].lower() == 'mb'):
                        bytesize.append(
                            int(self.basic_sz *
                                float(sz2s[0].replace(',', ''))))
                    if (sz2s[1].lower() == 'gb'):
                        bytesize.append(
                            int(self.basic_sz *
                                float(sz2s[0].replace(',', '')) * 1024))

        if (len(titles) != len(nzburls)):
            return []
        if (len(titles) != len(tstamp)):
            return []
        if (len(titles) != len(rdetails)):
            return []
        if (len(titles) != len(bytesize)):
            return []
        if (len(categr) != len(titles)):
            categr = []

        for i in xrange(len(titles)):
            category_found = {}
            if (len(categr)):
                category_found[categr[i]] = 1
            else:
                category_found['N/A'] = 1

            d1 = {
                'title': titles[i],
                'poster': 'poster',
                'size': bytesize[i],
                'url': self.baseURL + nzburls[i],
                'filelist_preview': '',
                'group': 'N/A',
                'posting_date_timestamp': tstamp[i],
                'release_comments': self.baseURL + rdetails[i],
                'categ': category_found,
                'ignore': 0,
                'req_pwd': self.typesrch,
                'provider': self.baseURL,
                'providertitle': self.name
            }
            #~ print d1
            parsed_data.append(d1)
        return parsed_data
    def search_raw(self, pagestr, srchstr):
        if (self.cur_cfg['valid'] == 0):
            return []

        socket.setdefaulttimeout(self.timeout)

        #~ WIN: setting the locale seems to cause issues on win32
        # locale.setlocale( locale.LC_ALL, 'en_US.utf8' )

        self.cur_cfg['retcode'] = self.default_retcode

        if (self.chkcookie() == False):
            if (self.dologin() == False):
                return []
        mainurl = self.cur_cfg['url']
        loginurl = mainurl + pagestr + srchstr
        timestamp_s = time.time()
        try:
            socket.setdefaulttimeout(self.timeout)
            res = self.br.open(loginurl)
        except Exception as e:
            eret = self.mech_error_generic(e)
            if (eret == 302):
                self.reset_cookies()
            return []
        data = res.get_data()
        timestamp_e = time.time()
        log.info('TS ' + mainurl + " " + str(timestamp_e - timestamp_s))
        self.cur_cfg['retcode'][2] = timestamp_e - timestamp_s

        soup = beautifulsoup.BeautifulSoup(data)

        #~ def searchDBG(self, srchstr):
        #~ handler = open('tmp/tater.html').read()
        #~ soup = BeautifulSoup (handler)

        parsed_data = []
        titles = soup.findAll('a', {'class': 'title'})
        nzburls = soup.findAll('a', {'title': 'Download Nzb'})
        tstamp_raw = soup.findAll('td', {'class': 'less mid'})
        rdetails = soup.findAll('a', {'title': 'View details'})
        sz_raw = soup.findAll('td', {'class': 'less right'})
        catname_raw = soup.findAll('td', {'class': 'less'})

        catname = []
        for catn in catname_raw:
            catcont = catn.findAll(text=True)
            for catn1 in catcont:
                catcont_idx = catn1.find('">')
                if (catcont_idx != -1):
                    catname.append(catn1[catcont_idx + 2:len(catn1)].replace(
                        '>', '-').capitalize())

        bytesize = []
        for sz1 in sz_raw:
            #~ rawline = str(sz1).split()
            for sz2 in sz1.findAll(text=True):
                sz2s = sz2.split()

                if (len(sz2s) == 2):
                    #~ print sz2s[1].lower()
                    if (sz2s[1].lower() == 'mb'):
                        bytesize.append(
                            int(self.basic_sz *
                                float(sz2s[0].replace(',', ''))))
                    if (sz2s[1].lower() == 'gb'):
                        bytesize.append(
                            int(self.basic_sz *
                                float(sz2s[0].replace(',', '')) * 1024))
        #~ print bytesize

        #~ 2010-05-08 18:53:09
        tstamp = []
        for tt in tstamp_raw:
            for tt2 in tt.attrs:
                #~ print tt2[1]
                if ('title' in tt2):
                    #~ print tt2[1]
                    tstamp.append(
                        time.mktime(
                            datetime.datetime.strptime(
                                tt2[1], "%Y-%m-%d %H:%M:%S").timetuple()))
                    break

        #~ deep debug
        #~ print 'tit' + str(len(titles))
        #~ print 'tst' + str(len(tstamp))
        #~ print 'url' + str(len(nzburls))
        #~ print 'det' + str(len(rdetails))
        #~ print 'sz' + str(len(bytesize))
        #~ print 'cat' + str(len(catname))

        skipts = 1
        if (len(titles)):
            if (len(tstamp) % len(titles) == 0):
                skipts = len(tstamp) / len(titles)
                if (skipts < 1):
                    return []

        if (len(titles) != len(nzburls)):
            return []
        #~ if(len(titles) != len(tstamp)):
        #~ return []
        if (len(titles) != len(rdetails)):
            return []
        if (len(titles) != len(bytesize)):
            return []

        for i in xrange(len(titles)):
            category_found = {}

            if (len(catname) == len(titles)):
                category_found[catname[i]] = 1
            else:
                category_found['N/A'] = 1

            d1 = {
                'title': ''.join(titles[i].findAll(text=True)),
                'poster': 'poster',
                'size': bytesize[i],
                'url': self.baseURL + '/' + nzburls[i]['href'],
                'filelist_preview': '',
                'group': 'N/A',
                'posting_date_timestamp': tstamp[i * skipts],
                'release_comments': self.baseURL + rdetails[i]['href'],
                'categ': category_found,
                'ignore': 0,
                'req_pwd': self.typesrch,
                'provider': self.baseURL,
                'providertitle': self.name
            }
            #~ print d1
            parsed_data.append(d1)

        return parsed_data
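Both search() and search_raw() above turn a "<number> MB" / "<number> GB" cell into a byte count by multiplying with self.basic_sz. Below is a standalone sketch of that conversion, assuming basic_sz is the bytes-per-megabyte factor; the actual value lives on the class and is not shown in these snippets.
BASIC_SZ = 1024 * 1024  # assumed bytes-per-MB factor; the snippets use self.basic_sz


def size_to_bytes(text):
    # "700 MB" -> 734003200, "1.4 GB" -> 1503238553; returns None for anything else
    parts = text.split()
    if len(parts) != 2:
        return None
    number = float(parts[0].replace(',', ''))
    unit = parts[1].lower()
    if unit == 'mb':
        return int(BASIC_SZ * number)
    if unit == 'gb':
        return int(BASIC_SZ * number * 1024)
    return None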
Example #7
0
	def search(self, srchstr):
		if(self.cur_cfg['valid'] == 0):
			return []
		
		socket.setdefaulttimeout(self.timeout)

		if	(self.chkcookie() == False):
			if(self.dologin() == False):
				return []
						
		mainurl = self.cur_cfg['url']
		#~ https://www.gingadaddy.com/nzbbrowse.php?b=2&st=1&k=dog&c=0&g=0&sr=2&o=0
		
		loginurl = mainurl + '/nzbbrowse.php?b=2&st=1&c=0&g=0&sr=2&o=0&k='+srchstr
		timestamp_s = time.time()	
		
		try:
			socket.setdefaulttimeout(self.timeout)
			res = self.br.open(loginurl)
		except Exception as e:
			eret = self.mech_error_generic(e)
			if(eret == 302):
				self.reset_cookies()
			return []	

		data = res.get_data()  
		timestamp_e = time.time()
		log.info('TS ' + mainurl + " " + str(timestamp_e - timestamp_s))
		

		#~ def searchDBG(self, srchstr):
		#~ handler = open('test.html').read()
		soup = beautifulsoup.BeautifulSoup(data)

		parsed_data = []
		titlesdiv = soup.findAll('div', {'class': 'pstnam'})
		nzburlsdiv = soup.findAll('div', {'class': 'dlnzb'})
		tstampdiv = soup.findAll('div', {'class': 'pstdat'})
		szdiv =  soup.findAll('abbr', {'title': 'Total size of articles'})

		titles = []
		rdetails = []
		nzburls = []
		tstamp = []
		bytesize = []
		
		for tl in titlesdiv:
			all_a = tl.findAll("a")
			titles.append(''.join(all_a[0].findAll(text=True)))
			rdetails.append(all_a[0]['href'][1:])

		for tl in nzburlsdiv:
			all_a = tl.findAll("a")
			nzburls.append(all_a[0]['href'][1:])

		#~ absolute day of posting
		for tl in tstampdiv:
			intage =  int(tl.findAll(text=True)[0].split()[0].split('.')[0])
			today = datetime.datetime.now()
			dd = datetime.timedelta(days=intage)
			earlier = today - dd
			tstamp.append(time.mktime(earlier.timetuple()))

		for sz1 in szdiv:
			for sz2 in sz1.findAll(text=True):
				sz2s = sz2.split()
				if(len(sz2s) == 2):
					if (sz2s[1].lower() == 'mb' ):
						bytesize.append( int(self.basic_sz * float(sz2s[0].replace(',', '')) ) )
					if (sz2s[1].lower() == 'gb' ):
						bytesize.append( int(self.basic_sz * float(sz2s[0].replace(',', '')) * 1024) )
 
		if(len(titles) != len(nzburls)):
			return []
		if(len(titles) != len(tstamp)):
			return []
		if(len(titles) != len(rdetails)):
			return []
		if(len(titles) != len(bytesize)):
			return []
			

		for i in xrange(len(titles)):
			d1 = {
				'title': titles[i],
				'poster': 'poster',
				'size': bytesize[i],
				'url': self.baseURL + nzburls[i],
				'filelist_preview': '',
				'group': 'N/A',
				'posting_date_timestamp': tstamp[i],
				'release_comments': self.baseURL + rdetails[i],
				'categ':{'N/A':1},
				'ignore':0,
				'req_pwd':self.typesrch,
				'provider':self.baseURL,
				'providertitle':self.name
			}
			#~ print d1
			parsed_data.append(d1)
		return parsed_data
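Example #7's search() (like the earlier variant) derives the posting timestamp from an age in days found in the pstdat cell. A standalone sketch of that computation; the "12.3 d" sample string is invented, only the "integer days before now" parsing is taken from the snippet.
import datetime
import time


def age_days_to_timestamp(agetext):
    # e.g. "12.3 d" (made-up sample): keep the integer part of the first token
    intage = int(agetext.split()[0].split('.')[0])
    earlier = datetime.datetime.now() - datetime.timedelta(days=intage)
    # Unix timestamp of "that many whole days ago"
    return time.mktime(earlier.timetuple())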
    def nzb_getinfo(self, data):

        h = HTMLParser.HTMLParser()

        soup = beautifulsoup.BeautifulSoup(data)
        fileno = soup.findAll('file')

        filesegs = []
        fileinfo = {}
        fileinfo['pars'] = 0
        fileinfo['nfo'] = 0
        fileinfo['nofile'] = 0
        fileinfo['rar'] = 0
        fileinfo['nzb'] = 0
        fileinfo['sfv'] = 0
        fileinfo['postid'] = []
        allfiles_LUT = {}
        allfiles = {}

        rootfile = ''
        nbytes = 0

        for fno in fileno:
            #~ try:
            #~ print fno['subject']
            segs = fno.findAll('segments')
            groups = fno.findAll('groups')
            fsggs = 0
            parfile = 0
            typefile = self.MSGTYPE_ARCHIVE

            #~ val =  re.search(r".r[0-9]{2,4}", fno['subject'], re.I)
            val_sample = re.search(r"[\.\-]sample", fno['subject'], re.I)
            if (val_sample is not None):
                continue
            # '.par2' alone marks the PAR2 index file; a '.volNN' infix marks a repair volume
            par2idx = fno['subject'].lower().find('.par2')
            if (par2idx != -1):
                typefile = self.MSGTYPE_PAR2IDX
                fileinfo['pars'] = fileinfo['pars'] + 1
                npar_vol = re.search(r".vol[0-9]{1,4}",
                                     fno['subject'][1:par2idx + 5], re.I)
                if (npar_vol is not None):
                    typefile = self.MSGTYPE_PAR2VOL
            if (fno['subject'].lower().find('.nfo') != -1):
                typefile = self.MSGTYPE_NFO
                fileinfo['nfo'] = fileinfo['nfo'] + 1
            if (fno['subject'].lower().find('.sfv') != -1):
                typefile = self.MSGTYPE_SFV
                fileinfo['sfv'] = fileinfo['sfv'] + 1
            if (fno['subject'].lower().find('.nzb') != -1):
                typefile = self.MSGTYPE_NZB
                fileinfo['nzb'] = fileinfo['nzb'] + 1

            if (typefile == 0):
                allfiles[h.unescape(fno['subject'])] = 1

            cur_group = []
            for g in groups:
                g_groups = g.findAll('group')
                for g2 in g_groups:
                    cur_group.append(''.join(g2.findAll(text=True)))

            for s in segs:
                s_segs = s.findAll('segment')
                fsggs = fsggs + len(s_segs)
                postid = []
                for s2 in s_segs:
                    nbytes += int(s2['bytes'])
                    subject = h.unescape(fno['subject'])
                    keyname = re.findall(r'\"(.+?)\"', subject)[0]
                    if (keyname not in allfiles_LUT):
                        allfiles_LUT[keyname] = []
                    allfiles_LUT[keyname].append(len(filesegs))

                    filesegs.append([
                        subject,
                        int(s2['bytes']), typefile,
                        h.unescape(''.join(s2.findAll(text=True))), cur_group,
                        self.STATUS_INIT, -2
                    ])

        #~ except Exception as e:
        #~ print "Error, could not parse NZB file " + str(e)
        #~ sys.exit()
        allfiles_sorted = []
        allfiles_sorted_clean = []
        for key in allfiles:
            allfiles_sorted.append(key)
        allfiles_sorted = sorted(allfiles_sorted)
        for s in allfiles_sorted:
            allfiles_sorted_clean.append(re.findall(r'\"(.+?)\"', s)[0])

        self.infodata = {}
        self.infodata['summary'] = fileinfo
        self.infodata['detail'] = filesegs
        self.infodata['subject'] = allfiles_sorted
        self.infodata['filename'] = allfiles_sorted_clean
        self.infodata['filename_LUT'] = allfiles_LUT
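nzb_getinfo keys its lookup table on the first double-quoted token of the HTML-unescaped subject line. A small sketch of just that extraction; the subject string below is invented for illustration.
import re
import HTMLParser

h = HTMLParser.HTMLParser()
# invented subject line, just to show the unescape + quoted-name extraction
subject = h.unescape('Some.Release [01/42] - &quot;some.release.part01.rar&quot; yEnc (1/50)')
keyname = re.findall(r'\"(.+?)\"', subject)[0]
# keyname -> 'some.release.part01.rar'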
Example #9
0
	def search(self, srchstr):
		if(self.cur_cfg['valid'] == 0):
			return []
		
		socket.setdefaulttimeout(self.timeout)
		
		#~ WIN: setting the locale seems to cause issues on win32
		# locale.setlocale( locale.LC_ALL, 'en_US.utf8' )
		
		if	(self.chkcookie() == False):
			if(self.dologin() == False):
				return []

		mainurl = self.cur_cfg['url']
		loginurl = mainurl + "/search/"+srchstr
		timestamp_s = time.time()	
		try:
			socket.setdefaulttimeout(self.timeout)
			res = self.br.open(loginurl)
		except Exception as e:
			eret = self.mech_error_generic(e)
			print eret
			if(eret == 302):
				self.reset_cookies()
			return []	

		data = res.get_data()  
		timestamp_e = time.time()
		log.info('TS ' + mainurl + " " + str(timestamp_e - timestamp_s))

		soup = beautifulsoup.BeautifulSoup(data)

		#~ def searchDBG(self, srchstr):
		#~ handler = open('tmp/tater.html').read()
		#~ soup = BeautifulSoup (handler)
		
		parsed_data = []
		titles = soup.findAll('a', {'class': 'title'})
		nzburls = soup.findAll('a', {'title': 'Download Nzb'})
		tstamp_raw = soup.findAll('td', {'class': 'less mid'})
		rdetails = soup.findAll('a', {'title': 'View details'})
		sz_raw = soup.findAll('td', {'class': 'less right'})

		bytesize = []
		for sz1 in sz_raw:
			#~ rawline = str(sz1).split()
			for sz2 in sz1.findAll(text=True):
				sz2s =  sz2.split()
				

				if(len(sz2s) == 2):
					#~ print sz2s[1].lower()
					if (sz2s[1].lower() == 'mb' ):
						bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', '')) ))
					if (sz2s[1].lower() == 'gb' ):
						bytesize.append(int(self.basic_sz * float(sz2s[0].replace(',', '')) * 1024))
		#~ print bytesize

		#~ 2010-05-08 18:53:09
		tstamp = []
		for tt in tstamp_raw:
			for tt2 in tt.attrs:
				#~ print tt2[1]
				if('title' in tt2):
					tstamp.append( time.mktime(datetime.datetime.strptime(tt2[1], "%Y-%m-%d %H:%M:%S").timetuple()) )

		if(len(titles) != len(nzburls)):
			return []
		if(len(titles) != len(tstamp)):
			return []
		if(len(titles) != len(rdetails)):
			return []
		if(len(titles) != len(bytesize)):
			return []
			

		for i in xrange(len(titles)):
			d1 = {
				'title': ''.join(titles[i].findAll(text=True)),
				'poster': 'poster',
				'size': bytesize[i],
				'url': self.baseURL + '/' + nzburls[i]['href'],
				'filelist_preview': '',
				'group': 'N/A',
				'posting_date_timestamp': tstamp[i],
				'release_comments': self.baseURL  + rdetails[i]['href'],
				'categ':{'N/A':1},
				'ignore':0,
				'req_pwd':self.typesrch,
				'provider':self.baseURL,
				'providertitle':self.name
			}
			#~ print d1
			parsed_data.append(d1)
		
		
		return parsed_data
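Example #9 (and search_raw above) read the posting time from a title attribute such as "2010-05-08 18:53:09" and convert it to a Unix timestamp. The conversion on its own:
import datetime
import time


def title_to_timestamp(title):
    # "2010-05-08 18:53:09" -> epoch seconds in the local timezone
    dt = datetime.datetime.strptime(title, "%Y-%m-%d %H:%M:%S")
    return time.mktime(dt.timetuple())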