def lyrics():
    """ Extracts lyrics from the current song on 'bide et musique' """
    res = ""
    page = urllib.urlopen(HOME_PAGE)
    content = page.read()
    page.close()
    soup = BeautifulSoup(content)
    souptitle = soup.findAll("p", {"class": "titre-song"})[0]
    title = souptitle.text
    artist = soup.findAll("p", {"class": "titre-song2"})[0].text
    url = "http://www.bide-et-musique.com"
    url = "%s%s" % (url, souptitle.a.get("href"))
    page = urllib.urlopen(url)
    content = page.read()
    page.close()
    soup = BeautifulSoup(content)
    tab = soup.findAll("td", {"class": "paroles"})
    if tab == []:
        res = "Pas de paroles disponibles pour %s de %s" % (artist, title)
    else:
        tab = tab[0].contents
        res = "%s - %s\n%s\n" % (artist, title, "*" * 30)
        lyrics_content = ""
        for elt in tab:
            tmp = elt
            if str(tmp).lstrip() != "<br />":
                lyrics_content += xhtml2text(unicode(tmp).lstrip()) + "\n"
        res += lyrics_content
    return xhtml2text(res)
def getAvailabilityRank(table):
    try:
        # print "getting List of ATMs requires attention..."
        soup = BeautifulSoup(str(table))
        rows = soup.findAll('tr')
        numRows = getRowsNumber(table)
        numRowsHead = getRowsHeadNumber(table)
        arrBestBranchBri = []
        for a in range(2, numRows - 1):
            trs = BeautifulSoup(str(rows[a]))
            tdcells = trs.findAll("td")
            percentAvailBri = float(tdcells[17].getText())
            ukerName = cleanUpNamaUker(tdcells[0].getText())
            if percentAvailBri == 100.00:
                # arrBestBranch.append(ukerName+", "+jumlahATM)
                arrBestBranchBri.append(ukerName)
    except IndexError:
        arrBestBranchBri = getAvailabilityRank(table)
    return sorted(arrBestBranchBri)
def firstn(self, count=5, **kwargs):
    url = "http://iwiw.hu/search/pages/user/ajaxsearch.jsp?do=AdvancedSearch&page=0&"
    url += urlencode(kwargs)
    iwiwsearch = urlfetch.fetch(url, headers={'Cookie': self.logincookie}).content
    # try:
    leves = BeautifulSoup(iwiwsearch)
    mennyivan = len(leves.findAll("div", "cardContainer"))
    count = int(count)
    if mennyivan - count >= 0:
        mennyit = count
    if mennyivan - count < 0:
        mennyit = mennyivan
    results = []
    for i in range(mennyit):
        ez = leves.findAll("div", "cardContainer")[i]
        # userid = ez.find("a")["name"].replace("uid","")
        ebben_van_a_popup_url = ez.findChildren("a")[2]["onclick"]
        pic_popup_url = re.search("'.*?'", ebben_van_a_popup_url).group(0)
        pic_thumbnail = ez.img["src"]
        name = ez.findChildren("a")[1].contents[0]
        profile_url = ez.findChildren("a")[1]["href"]
        result = {
            "name": name,
            "profile_url": profile_url,
            "pic_thumbnail": pic_thumbnail,
            "pic_popup_url": pic_popup_url
        }
        # append the result
        results.append(result)
    return results
def getCDMStats(table):
    soup = BeautifulSoup(str(table))
    rows = soup.findAll('tr')
    numRows = getRowsNumber(table)
    numRowsHead = getRowsHeadNumber(table)
    # print numRowsHead, numRows
    msgBody = ""
    for i in range(0, numRows):
        trs = BeautifulSoup(str(rows[i]))
        tdcells = trs.findAll("td")
        thcells = trs.findAll("th")
        # print len(tdcells), len(thcells)
        if thcells:
            msgBody += "\n*" + thcells[0].getText().upper() + "*\n----------------------------------\n"
        if tdcells:
            if len(tdcells) > 1:
                msgBody += tdcells[0].getText().upper() + ": " + asterisk(tdcells[1].getText()) + "\n"
    return msgBody.replace("_", " ")
def getMovieData(self): list = [] #-- get serial play list & parameters ------------------------------------- html = self.Auth.get_HTML(self.serial_url, None, 'http://serialu.net/media/uppod.swf') # -- parsing web page html = re.compile('<body>(.+?)<\/body>', re.MULTILINE|re.DOTALL).findall(html)[0] soup = BeautifulSoup(html) pl_url = '' is_multiseason = len(soup.findAll('object', {'type':'application/x-shockwave-flash'})) for rec in soup.findAll('object', {'type':'application/x-shockwave-flash'}): if is_multiseason > 1: season = rec.parent.previousSibling.previousSibling.text+r' ' else: season = r'' for par in rec.find('param', {'name':'flashvars'})['value'].split('&'): if par.split('=')[0] == 'pl': pl_url = par[3:] if pl_url.find('http:') == -1: pl_url = xppod.Decode(pl_url) #-- get playlist details --------------------------------------------------- html = self.Auth.get_HTML(pl_url, None, 'http://serialu.net/media/uppod.swf') self.pl_url = pl_url # -- check if playlist is encoded if html.find('{"playlist":[') == -1: html = xppod.Decode(html).encode('utf-8').split(' or ')[0] #-- TODO: make smart choice # -- parsing web page s_url = '' s_num = 0 movie_list = [] for rec in re.compile('{(.+?)}', re.MULTILINE|re.DOTALL).findall(html.replace('{"playlist":[', '')): for par in rec.replace('"','').split(','): if par.split(':')[0]== 'comment': name = str(s_num+1) + ' серия' #par.split(':')[1]+' ' if par.split(':')[0]== 'file': if 'http' in par.split(':')[1]: s_url = par.split(':')[1]+':'+par.split(':')[2] else: s_url = xppod.Decode(par.split(':')[1]).split(' or ')[0] s_num += 1 # mark part for history name = season.encode('utf-8') + name movie_list.append({'movie_name': name, 'url': s_url}) #if h_part <> '-': # if name == h_part: # name = '[COLOR FF00FF00]'+name+'[/COLOR]' #-- parse data list.append({'name':self.serial_name, 'img': self.serial_img, 'descr': self.serial_descr, 'season_number':s_num, 'name_orig':'', 'movie': movie_list}) #-- return movie list return list
def parseImgLinks(self, depth=1):
    url_response = None
    try:
        url_response = urllib2.urlopen(self.scrap_url, timeout=self._timeout)
    except Exception as e:
        print(" [ERROR]: Could not open {0}: {1}".format(self.scrap_url, e.reason))
        return self.img_list

    html_parse = BeautifulSoup(url_response)
    unique_images_found = 0
    total_images_found = 0
    self.visited[self.scrap_url] = 1

    for img in html_parse.findAll('img'):
        try:
            abs_url = urljoin(self.scrap_url, img['src']) if urlparse(img['src']).netloc == "" else img['src']
            if abs_url not in self.img_list:
                self.img_list.add(abs_url)
                unique_images_found += 1
            total_images_found += 1
        except:
            pass

    print(" [Found %d images / %d new]: %s" % (total_images_found, unique_images_found, self.scrap_url))

    if depth > 1:
        for a in html_parse.findAll('a'):
            try:
                if (urlparse(a['href']).netloc == "") or (urlparse(self.scrape_url_orig).netloc == urlparse(a['href']).netloc):
                    self.scrap_url = urljoin(self.scrape_url_orig, a['href'])
                    if self.scrap_url in self.visited:
                        continue
                    self.parseImgLinks(depth - 1)
            except:
                pass
    return self.img_list
def get_submission_info(self):
    # Only valid after self.execute()
    # Parse submission information out of response.
    soup = BeautifulSoup(self.res_data)
    info = {}

    # Get submission status.
    tag = soup.findAll(id="submission_status")[0]
    info['status'] = tag.text.strip()

    # Get compilation text.
    tags = soup.findAll(id="compilation")
    if tags:
        content = tags[0]
        info['compile_output'] = content.pre.text.strip()
    else:
        info['compile_output'] = None

    # Get evaluation results.
    evaluations = []
    tags = soup.findAll(id=re.compile(r"^eval_outcome_"))
    text_tags = soup.findAll(id=re.compile(r"^eval_text_"))
    for outcome_tag, text_tag in zip(tags, text_tags):
        # Get evaluation text also.
        evaluations.append({
            'outcome': outcome_tag.text.strip(),
            'text': text_tag.text.strip(),
        })
    info['evaluations'] = evaluations

    return info
def __call__(self):
    html = self.render()
    soup = BeautifulSoup(html)
    for img in soup.findAll('img', {'class': 'leadimage'}):
        img['hspace'] = 8
        img['vspace'] = 8
    utm = self.getUTM(source='newsletter', medium='email', campaign=self.newsletter_title)
    for a in soup.findAll('a'):
        if '?' in a['href']:
            a['href'] = '%s&%s' % (a['href'], utm)
        else:
            a['href'] = '%s?%s' % (a['href'], utm)
    html = premailer.transform(soup.prettify())
    tags = ['dl', 'dt', 'dd']
    for tag in tags:
        html = html.replace("<%s" % tag, "<div")
        html = html.replace("</%s" % tag, "</div")
    return html
def LyricWikia(artist, title):
    url = 'http://lyrics.wikia.com/api.php?action=lyrics&artist={artist}&song={title}&fmt=json&func=getSong'.format(artist=artist, title=title).replace(" ", "%20")
    r = requests.get(url, timeout=15)

    # We got some badly formatted JSON data... so we need to fix it up :/
    returned = r.text
    returned = returned.replace("\'", "\"")
    returned = returned.replace("song = ", "")
    returned = json.loads(returned)

    if returned["lyrics"] != "Not found":
        # set the url to the one we just received, and retrieve it
        r = requests.get(returned["url"], timeout=15)
        soup = BeautifulSoup(r.text)
        soup = soup.find("div", {"class": "lyricbox"})
        [elem.extract() for elem in soup.findAll('div')]
        [elem.replaceWith('\n') for elem in soup.findAll('br')]
        # With old BeautifulSoup the following is needed; for recent versions
        # it isn't needed / doesn't work.
        try:
            soup = BeautifulSoup(str(soup), convertEntities=BeautifulSoup.HTML_ENTITIES)
        except:
            pass
        soup = BeautifulSoup(re.sub(r'(<!--[.\s\S]*-->)', '', str(soup)))
        [elem.extract() for elem in soup.findAll('script')]
        return soup.getText()
    else:
        return ()
def parse(self, html):
    """
    Parse the information table on USFIRST's site to extract team information.
    Return a list of dictionaries of team data.
    """
    teams = list()
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for title in soup.findAll('title'):
        if "FRC Team/Event List" not in title.string:
            return None

    team_rows = soup.findAll("pre")[0].string.split("\n")

    for line in team_rows[2:]:  # first row is blank, second is headers.
        data = line.split("\t")
        if len(data) > 1:
            try:
                teams.append({
                    "team_number": int(data[1]),
                    "name": data[2],
                    "short_name": data[3],
                    "nickname": data[7]
                })
            except Exception, e:
                logging.warning("Failed to parse team row: %s" % data)

    return teams
def searchcrawler(url, keyword=''):
    """
    Taobao search-page crawler
    """
    html = get_html(url)
    # print html
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'row item icon-datalink'})
        if items_row:
            print '=======================row search row=========================='
            # print items
            for item in items_row:
                item_info = item.find('div', {'class': 'col title'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
        items_col = soup.findAll('div', {'class': 'col item icon-datalink'})
        if items_col:
            print '=======================row search col=========================='
            # print items
            for item in items_col:
                item_info = item.find('div', {'class': 'item-box'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
def httpParserGuokr(url):
    '''
    Guokr feed: content, title, id, updated
    '''
    content = httpRequest(url)  # send the request
    soup = BeautifulSoup(content)
    item = soup.findAll('content')
    title = soup.findAll('title')
    link = soup.findAll('id')
    pubDate = soup.findAll('updated')
    article = title[0].text
    content = []
    counter = 0
    for t in range(len(title)):
        if t != 0 and t <= 6:
            num = 7 - t
            string = httpXpath(item[num - 1].contents[0].replace('&amp;', '&').replace(
                '&lt;', '<').replace('&gt;', '>').replace('&quot;', '"'))
            newtime = datetime.datetime.strptime(pubDate[num].contents[0][:10], "%Y-%m-%d")
            newtime = newtime.strftime('%Y年%m月%d日')
            arr = {}
            arr['article'] = article
            arr['title'] = title[num].contents[0]
            arr['link'] = link[num].contents[0]
            arr['pubdate'] = newtime
            arr['brief'] = string
            arr['descr'] = item[num - 1].contents[0].encode('utf-8').replace('&amp;', '&').replace(
                '&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
            arr['content'] = ""
            if counter == 0:
                status = Article.objects.filter(link=link[num].text).exists()
                if not status:
                    RssData().keepData(arr)
                    counter = 1
            else:
                RssData().keepData(arr)
def replaceURL(URL, OUTPUT):
    # Provide user feedback
    print "[+] Replacing the URLs in the HTML source."
    print "[+] URLs that will be replaced:"
    # Open source, read lines, and begin parsing to replace all URLs for scripts and links
    try:
        # Print href URLs that will be replaced
        print "\n".join(re.findall('<a href="?\'?([^"\'>]*)', open(OUTPUT).read()))
        with open(OUTPUT, "r+b") as html:
            # Read in the source html and parse with BeautifulSoup
            soup = BeautifulSoup(html)
            # Find all links and replace URLs with our new text/URLs
            for link in soup.findAll('a', href=True):
                link['href'] = '{{links.phishgate}}'
            for link in soup.findAll('link', href=True):
                link['href'] = urlparse.urljoin(URL, link['href'])
            for link in soup.findAll('script', src=True):
                link['src'] = urlparse.urljoin(URL, link['src'])
            source = soup.prettify()
            source = xml.sax.saxutils.unescape(source)
        # Write the updated URLs to output file while removing the [' and ']
        output = open(OUTPUT, "w")
        output.write(source.replace('[', '').replace(']', ''))
        output.close()
        print "[+] URL parsing successful. All URLs have been replaced."
    except:
        print "[-] URL parsing failed. Make sure the html file exists and is readable."
def parseing(url, id, ResultList):
    pDict = {}
    req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
    con = urllib2.urlopen(req)
    html = con.read()
    # print html
    soup = BeautifulSoup(html)
    images = soup.findAll('img', {'class': 'img-responsive screengrab'})
    # for image in images:
    #     print image.get('src')
    image_url = 'https://worldofvnc.net/' + images[0].get('src')
    download_photo(image_url, str(id) + '.jpg')
    ConvertImage(str(id))
    pDict.update({'image_url': image_url})
    pDict.update({'local_image_name': str(id) + '.png'})
    listdata = soup.findAll('li', {'class': 'list-group-item'})
    for x in listdata:
        # print x.b.contents[0], x.contents[1]
        pDict.update({x.b.contents[0]: x.contents[1]})
    ResultList.append(pDict)
    # dataDict.update({'VPN_ID': str(id), 'VPN_Content': pDict})
    # print "inner data"
    # print ResultList
    return ResultList
def _parseHtml(self, html):
    """Parse HTML"""
    ret = []
    soup = BeautifulSoup(html)
    for div in soup.findAll("div", {"class": "giaeffettuate"}) + soup.findAll("div", {"class": "corpocentrale"}):
        station = div.find("h2")
        station = str(station.contents[0])

        # Now get the time
        prog = None
        real = None
        tag = None
        for p in div.findAll("p"):
            t = str(p.contents[0])
            time = p.find("strong")
            if len(time.contents) > 0:
                time = str(time.contents[0])
            else:
                time = "00:00"
            if re.search("(?i)programmat(a|o)", t):
                prog = time.rstrip().lstrip()
            elif re.search("(?i)effettiv(a|o)", t):
                real = time.rstrip().lstrip()
                tag = "eff"
            elif re.search("(?i)previst(a|o)", t):
                real = time.rstrip().lstrip()
                tag = "est"
        assert (prog is not None and real is not None and tag is not None)
        print station
        print prog
        print real
        print tag
        e = (station, prog, real, self.timediff(prog, real), tag)
        ret.append(e)
    return ret
def read_annotation(self):
    files = [f for f in self.annotation_set if isfile(join(self.annotation_path, f))]
    cfile = 1
    nfile = len(files)
    for fi in files:
        # print "Reading trainval " + str(cfile) + "/" + str(nfile)
        cfile += 1
        file_name = self.annotation_path + fi
        f = open(file_name, 'r')
        content = f.read()
        y = BeautifulSoup(str(content))
        img = self.image_path + y.filename.string
        for obj in y.findAll('object'):
            bs_obj = BeautifulSoup(str(obj))
            name = bs_obj.findAll('name')
            if name[0].string == self.cls:
                cls = 1
            else:
                cls = 0
            bndbox = bs_obj.findAll('bndbox')
            (x, y, w, h) = (
                int(bndbox[0].xmax.string),
                int(bndbox[0].xmin.string),
                int(bndbox[0].ymax.string),
                int(bndbox[0].ymin.string)
            )
            data = img, (x, y, w, h), cls
            if cls == 1:
                self.train_set_image_pos.append(data)
            else:
                self.train_set_image_neg.append(data)
def makeHTMLQuestion(fn, htmldata):
    soup = BeautifulSoup(htmldata)
    # add JS
    soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')"
    soup.find('head').insert(0, SUBMIT_JS)
    # replace forms
    forms = soup.findAll('form')
    if forms:
        for form in forms:
            if not form.has_key('method'):
                form['method'] = 'POST'
            if not form.has_key('action'):
                if testmode:
                    form['action'] = 'http://workersandbox.mturk.com/mturk/externalSubmit'
                else:
                    form['action'] = 'http://www.mturk.com/mturk/externalSubmit'
            if not form.has_key('onSubmit'):
                form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');"
            inputtag = Tag(soup, 'input')
            inputtag['type'] = 'hidden'
            inputtag['name'] = 'assignmentId'
            inputtag['id'] = 'myAssignmentId'
            inputtag['value'] = ''
            form.insert(0, inputtag)
    mainurl = uploadfile(fn, str(soup))
    for sub in soup.findAll('img'):
        # TODO
        fn = dirname(fn) + '/' + sub['src']
        uploadfile(fn)
    return ExternalQuestion(escape(mainurl), frame_height)
def getPengelolaSupervisi(strTID):
    try:
        strURL = "http://172.18.65.42/statusatm/viewatmdetail.pl?ATM_NUM=" + strTID
        strHTML = fetchHTML(strURL)
        table = getLargestTable(getTableList(strHTML))

        strHTMLTableRows = getSpecificRows(table, getRowIndex(table, "Replenish By"))
        mysoup = BeautifulSoup(strHTMLTableRows)
        arrTDs = mysoup.findAll('td')
        strPengelola = arrTDs[1].getText()

        strHTMLTableRows = getSpecificRows(table, getRowIndex(table, "KC Supervisi"))
        mysoup = BeautifulSoup(strHTMLTableRows)
        arrTDs = mysoup.findAll('td')
        strSupervisi = arrTDs[1].getText()
    except IndexError:
        strPengelola, strSupervisi = getPengelolaSupervisi(strTID)
    except RuntimeError:
        strPengelola, strSupervisi = "---", "KANWIL BRI JAKARTA III"
    return strPengelola, strSupervisi
def _parse(self, html):
    soup = bs4.BeautifulSoup(html)
    print_link = soup.findAll('a', text='Print')[0].get('href')
    html2 = grab_url(print_link)
    logger.debug('got html 2')
    # Now we have to switch back to bs3. Hilarious.
    # And the labeled encoding is wrong, so force utf-8.
    soup = BeautifulSoup(html2, convertEntities=BeautifulSoup.HTML_ENTITIES, fromEncoding='utf-8')
    self.meta = soup.findAll('meta')
    p_tags = soup.findAll('p')[1:]
    real_p_tags = [p for p in p_tags if not p.findAll(attrs={'class': "twitter-follow-button"})]
    self.title = soup.find('strong').getText()
    entity = soup.find('span', attrs={'class': 'author'})
    children = list(entity.childGenerator())
    try:
        self.byline = 'By ' + children[1].getText()
    except IndexError:
        self.byline = ''
    self.date = children[-1].strip()
    self.body = '\n' + '\n\n'.join([p.getText() for p in real_p_tags])
def getATMProbUKOCRO2(table):
    try:
        # print "getting List of ATMs requires attention..."
        soup = BeautifulSoup(str(table))
        rows = soup.findAll('tr')
        numRows = getRowsNumber(table)
        numRowsHead = getRowsHeadNumber(table)
        numProbUKO = 0
        numProbCRO = 0
        for i in range(numRowsHead, numRows):
            trs = BeautifulSoup(str(rows[i]))
            tdcells = trs.findAll("td")
            if "ATM CENTER" in tdcells[8].getText():
                numProbCRO = numProbCRO + 1
        numProbUKO = numRows - numProbCRO - numRowsHead
        # print "number of CRO problem(s)", numProbCRO, "number of UKO problem(s):", numProbUKO
    except IndexError:
        numProbUKO, numProbCRO = getATMProbUKOCRO(table)
    return int(numProbUKO), int(numProbCRO)
def get(self):
    content = self.request.content
    soup = BeautifulSoup(''.join(content))
    # All text has already been converted to unicode automatically;
    # call encode(xxx) yourself if another encoding is needed.
    title = soup.html.body.h1
    if not title:
        return
    title = title.text
    subtitle = soup.findAll(attrs={'class': 'f_cy f_s16b'})[0].string
    description = soup.find(attrs={'class': 'f_cy f_s14 pt20'})
    description = description.text if description else ''
    smooth_index = soup.findAll(attrs={'class': 'pt20'})[0]
    smooth_index = smooth_index.text if smooth_index else ''
    information = soup.findAll(attrs={'class': 'pt20'})[1]
    information = information.text if information else ''
    tips = soup.find(attrs={'class': 'f_s14 pt20'})
    tips = tips.text + tips.nextSibling.nextSibling.text if tips else ''
    # pics = soup.findAll('a', href=re.compile(r'pic\d'))
    pics = soup.findAll(attrs={'class': 'pic1'})
    if pics:
        imageList = []
        for pic in pics:
            img = pic.find('img')['src']
            imageList.append(img)
            spider.put(HTTP % img)
        self.page.append((self.request.url, title, subtitle, description,
                          smooth_index, information, tips, imageList))
def sanitize_html(value):
    from BeautifulSoup import BeautifulSoup, Comment, Tag

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor emits newer
    # HTML than Vodafone Live will accept. We have to translate 'em' back
    # to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),  # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),   # when loading them as I's the editor leaves them
        ("b", "b"),   # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = [tag for tag, replacement_tag in tags]
    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivalents
    for element, replacement_element in tags:
        if element is not replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    return (
        fragment.replace("&nbsp;", " ")
        .replace("&rsquo;", "'")
        .replace("&lsquo;", "'")
        .replace("&quot;", '"')
        .replace("&ldquo;", '"')
        .replace("&rdquo;", '"')
        .replace("&bull;", "- ")
        .replace("&eacute;", "e")
        .replace("&Eacute;", "E")
        .replace("&ndash;", "-")
    )
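# A hedged usage sketch for sanitize_html above (added for illustration, not
# from the original source): 'em'/'strong' are translated back to 'i'/'b',
# attributes are stripped, and disallowed tags are hidden while their text is
# kept. The input markup is invented, and the exact whitespace of the result
# depends on the tidy library behind tidy_fragment.
dirty = '<p>Hello <em>there</em> <span class="x">friend</span> <strong>bold</strong></p>'
print sanitize_html(dirty)
# Expected shape of the result: <p>Hello <i>there</i> friend <b>bold</b></p>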
def baseurl(html, base):
    if not base.endswith('/'):
        base += '/'
    absurl = re.compile(r'\s*[a-zA-Z][a-zA-Z0-9\+\.\-]*:')  # Starts with scheme:.

    def isabs(url):
        return url.startswith('/') or absurl.match(url)

    soup = BeautifulSoup(html)
    for link in soup.findAll('a', href=True):
        if not isabs(link['href']):
            link['href'] = base + link['href']
    for img in soup.findAll('img', src=True):
        if not isabs(img['src']):
            img['src'] = base + img['src']

    elements = soup.findAll(style=True)  # All styled elements.
    for e in elements:
        def func(m):
            url = m.group(2)
            if not isabs(url):
                url = base + url
            return m.group(1) + url + m.group(3)
        e['style'] = re.sub(r'''(url\(\s*)([^\s\)\"\']*)(\s*\))''', func, e['style'])
        e['style'] = re.sub(r'''(url\(\s*")([^\s\"]*)("\s*\))''', func, e['style'])
        e['style'] = re.sub(r'''(url\(\s*')([^\s\']*)('\s*\))''', func, e['style'])
    return unicode(soup)
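# A minimal usage sketch for baseurl above (added for illustration, not from
# the original source): relative href/src/url() values are prefixed with the
# base, while absolute ones are left alone. The HTML string is invented.
print baseurl('<a href="page.html">x</a><img src="/img/logo.png" />',
              'http://example.com/docs')
# -> <a href="http://example.com/docs/page.html">x</a><img src="/img/logo.png" />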
def content(self, val):
    """Set content"""
    soup = BeautifulSoup(val)

    for todo in soup.findAll('en-todo'):
        todo.name = 'input'
        todo['type'] = 'checkbox'
        if todo.get('checked') == 'false':
            del todo['checked']
        self.changed_by_default = True

    for media in soup.findAll('en-media'):
        if media.get('hash'):  # evernote android app error
            media.name = 'img'
            res = self.parent.resource_edit.get_by_hash(media['hash'])  # shit!
            if res:
                if media['type'].find('image') == 0:
                    media['src'] = 'file://%s' % res.file_path
                else:
                    media['src'] = file_icon_path
                media['title'] = res.file_name
                res.in_content = True
            else:
                media['src'] = ''
                media['title'] = ''
        else:
            media.hidden = True

    # set table id's for identifying on hover
    for num, table in enumerate(soup.findAll('table')):
        table['id'] = 'table_%d' % num
        self._last_table_num = num

    self._content = re.sub(
        r'( | ){5}', '<img class="tab" />',
        unicode(soup).replace(u'\xa0', ' '),
    )  # shit!
    self.apply()
def test_projects_access_restrictions(self):
    # Given an org with two projects
    project1 = Project.objects.create(title='Project 1')
    project1.publish()
    project2 = Project.objects.create(title='Project 2')
    project2.publish()
    Partnership.objects.create(organisation=self.org, project=project1)
    Partnership.objects.create(organisation=self.org, project=project2)
    Employment.objects.create(
        user=self.user3, organisation=self.org, group=self.user_group, is_approved=True
    )

    # When project1 is added to user1's project white list
    white_list = UserProjects.objects.create(user=self.user3, is_restricted=True)
    white_list.projects.add(project1)

    # Then user1's project list should include only project1
    self.c.login(username=self.user3.username, password=self.password)
    url = '/myrsr/projects'
    response = self.c.get(url, follow=True)

    from BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(response.content)

    self.assertEqual(len(soup.findAll('table')), 1)
    # There should be two table rows: one header and one for project1
    self.assertEqual(len(soup.findAll('table')[0].findChildren('tr')), 2)
def extract_label(self, url):
    rsp = self.ua.open(url)
    text = rsp.read()
    soup = BeautifulSoup(text)

    label = soup.findAll('div', {'id': 'gallery-label'})
    label = label[0]
    label = label.text

    tombstone = soup.findAll('div', {'class': 'tombstone'})
    tombstone = tombstone[0]
    sections = tombstone.findAll('div')

    data = {'Chat': label}
    for sect in sections:
        text = sect.text
        stuff = text.split(":")
        k = stuff[0]
        v = stuff[1]
        data[k] = v
    return data
def getATMStats(table):
    soup = BeautifulSoup(str(table))
    rows = soup.findAll('tr')
    numRows = getRowsNumber(table)
    numCols = getColsNumber(table)
    numRowsHead = getRowsHeadNumber(table)
    # print numRowsHead, numRows
    msgBody = ""
    seqNo = 0
    for i in range(0, numRows):
        trs = BeautifulSoup(str(rows[i]))
        tdcells = trs.findAll("td")
        thcells = trs.findAll("th")
        # print len(tdcells), len(thcells)
        if tdcells:
            msgBody += ("\n" + tdcells[0].getText().upper() + ") " + tdcells[1].getText() + ", "
                        + tdcells[2].getText().replace("HYOSUNG", "HYOSUNG ")
                        + "\nLOKASI: " + tdcells[4].getText()
                        + "\nAREA: " + tdcells[5].getText()
                        + "\nDURASI: " + tdcells[6].getText().replace("days", "hari ").replace("hours", "jam")
                        + "\n" + "\n")
    if msgBody == "":
        msgBody = "Tidak ada ATM PROBLEM OPS di wilayah kerja Anda."
    return msgBody
def wiki_links(text, group=None):
    """Replaces CamelCase words with wiki-links."""
    from BeautifulSoup import BeautifulSoup
    autoescape = False
    safe_input = isinstance(text, SafeData)
    conditional_escape(text)
    soup = BeautifulSoup(text)
    for url in soup.findAll(text=wikiword_link):
        if url.parent.name == 'a':
            continue
        new_str = wikiword_link.sub(curry(_re_callback, inside=False, group=group), url)
        url.replaceWith(BeautifulSoup(new_str))
    soup = BeautifulSoup(str(soup))  # Fix for a bug in some versions of BS
    for a in soup.findAll('a'):
        url = a.get('href')
        if not url:
            continue
        new_str = wikiword_link_href.sub(curry(_re_callback, inside=True, group=group), url)
        if new_str != url:
            a['href'] = new_str
    result = force_unicode(soup)
    if safe_input:
        result = mark_safe(result)
    elif autoescape:
        result = escape(result)
    return result
def GET(self):
    i = web.input()
    appid = i.get('appid').rstrip()
    email, url, title = i.get('appdata', '||').split('|')
    userhash = i.get('userhash')
    ts = i.get('ts')
    token = i.get('token')
    query = urllib.urlencode(dict(url=url, title=title))
    if not token:
        raise web.seeother('/share?%s' % query)

    # XXX: security verification etc..
    url = yahooLoginURL(email, '/WSLogin/V1/wspwtoken_login', token)
    try:
        resp = urllib2.urlopen(url)
    except:
        helpers.set_msg('Authorization Failed.')
        raise web.seeother('/share?%s' % query)

    content = resp.read()
    soup = BeautifulSoup(content)
    aurl = 'http://address.yahooapis.com/v1/searchContacts?format=json'
    wssid = soup.findAll('wssid')[0].contents[0]
    cookie = soup.findAll('cookie')[0].contents[0]
    cookie = cookie.strip()

    furl = aurl + '&fields=email,name&email.present=1&appid=%s&WSSID=%s' % (appid, wssid)
    req = urllib2.Request(furl)
    req.add_header('Cookie', cookie)
    req.add_header('Content-Type', 'application/json')
    response = urllib2.urlopen(req).read()
    contacts = self.get_contacts(response)
    save_contacts(email, contacts, provider='YAHOO')
    raise web.seeother('/share?%s' % query)
def parse_site(url, sql_connection):
    sql_cursor = sql_connection.cursor()
    page_html = urllib2.urlopen(url)
    soup = BeautifulSoup(page_html)
    pages = []
    page_nums = []
    for raw_a in soup.findAll('td', {'class': 'page'}):
        page_num_text = raw_a.text
        if page_num_text.encode('utf-8').strip() == u'···'.encode('utf-8').strip():
            pass
        else:
            page_num = int(page_num_text)
            if page_nums and (page_num - page_nums[-1]) > 1:
                for i in xrange(page_nums[-1], page_num + 1):
                    pages.append(url + 'index' + str(i) + ".html")
            page_nums.append(page_num)
            pages.append(PORTAL_NAME + str(raw_a.a['href']))
    pages = unique(pages)
    pages.append(url)
    for pg in pages:
        print pg
        ps = BeautifulSoup(urllib2.urlopen(pg))
        for item in ps.findAll('a', {'class': 'itemname'}):
            try:
                print item.contents[0].strip()
                print item.span.string
                sql_cursor.execute("INSERT INTO parsed(site, program, version) VALUES(?, ?, ?)",
                                   [pg, item.contents[0].strip(), item.span.string])
            except AttributeError as e:
                sql_connection.rollback()
                continue
            else:
                sql_connection.commit()
def html_analysis_average(html_path): ouput_filename = html_path + "-links.txt" f = open(ouput_filename, 'w') count_file_number = 0 count_externaljavascript = 0 count_cssexternallink = 0 count_login_number = 0 contain_login_number = 0 tofinda = 'www' tofindb = 'com' tofindc = 'log' tofindd = 'sign' tofinde = 'register' for filename in os.listdir(html_path): count_page_valid = 0 count_externallink = 0 count_internallink = 0 count_file_number = count_file_number + 1 print filename try: soup = BeautifulSoup(file(html_path + filename)) for link in soup.findAll('a'): if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'href') and tofindb not in link.get('href'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'src') or tofindb not in link.get('src'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('img'): if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'src') or tofindb not in link.get('src'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('img'): if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'href') or tofindb not in link.get('href'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('div'): if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'href') and tofindb not in link.get('href'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'src') or tofindb not in link.get('src'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('script'): if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_externaljavascript = count_externaljavascript + 1 count_page_valid = count_page_valid + 1 if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_externaljavascript = count_externaljavascript + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('link'): if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_cssexternallink = count_cssexternallink + 1 count_page_valid = count_page_valid + 1 if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_cssexternallink = count_cssexternallink + 1 count_page_valid = count_page_valid + 1 contain_login = False for link in soup.findAll('form'): if 
(link.get('name') != None): if tofindc in link.get('name') or tofindd in link.get( 'name') or tofinde in link.get('name'): count_login_number = count_login_number + 1 count_page_valid = count_page_valid + 1 contain_login = True elif (link.get('action') != None): if tofindc in link.get('action') or tofindd in link.get( 'action') or tofinde in link.get('action'): count_login_number = count_login_number + 1 count_page_valid = count_page_valid + 1 contain_login = True for link in soup.findAll('div'): if (link.get('name') != None): if tofindc in link.get('name') or tofindd in link.get( 'name') or tofinde in link.get('name'): count_login_number = count_login_number + 1 count_page_valid = count_page_valid + 1 contain_login = True elif (link.get('action') != None): if tofindc in link.get('action') or tofindd in link.get( 'action') or tofinde in link.get('action'): count_login_number = count_login_number + 1 count_page_valid = count_page_valid + 1 contain_login = True if contain_login == True: contain_login_number = contain_login_number + 1 if count_page_valid != 0: f.write(str(count_externallink - count_internallink) + ", ") except: pass f.close()
def parseSourceCode(soup,rankIndexValue, name , reviewerID , TotalReviews , HelpfulVotes , crNumPercentHelpful , crNumFanVoters): soupParsed = BeautifulSoup(soup) # the second table is for the review content datapart= soupParsed.findAll("table",{"class":"small"}) # review part is marked by "<div style="margin-left:0.5em;">" reviewParts= soupParsed.findAll("div",{"style":"margin-left:0.5em;"}) # for the review section of each product, there are two table with class value of small # so table 1, 4, 7, 10 is for the product link # in the product section, if the product is a vine free product, there is no price tag. productInfo=[] for productIndex in range(0,10): try: #productIndex=1 productPart = datapart[productIndex*3+1] productName= productPart.text price = productName[productName.find("Price:")+6:] price.replace("$","") productName = productName[:productName.find("Price:")] #print len(datapart) #print "----------------------------------------------" #print productPart productLink = productPart.find("a")['href'] #print productLink #productInfo.append([productName,productLink]) #reviewPartsIndex=0 reviewPart= reviewParts[productIndex] starReview=reviewPart.find("img")["title"] #print starReview reviewTime = reviewPart.find("nobr").text isVerifiedPurchase="Not Verified" CountVerifiedPurchase = reviewPart.findAll("span",{"class":"crVerifiedStripe"}) if CountVerifiedPurchase: #isVerifiedPurchase = reviewPart.find("span",{"class":"crVerifiedStripe"}).text isVerifiedPurchase ="Verified Purchase" print 'isVerifiedPurchase:',isVerifiedPurchase,'\n' #print reviewTime AllReviewText = reviewParts[productIndex].text if "Vine Customer Review of Free Product" in AllReviewText: IsVineReviewFreeProduct="YesVineReviewFreeProduct" else: IsVineReviewFreeProduct="NoVineReviewFreeProduct" #print IsVineReviewFreeProduct reviewText=reviewPart.find("div",{"class":"reviewText"}).text print "productName,",productName,"\n" print "productLink:\n", productLink,"\n" print "starReview:\n",starReview,"\n" print "reviewTime:\n",reviewTime,"\n" print "IsVineReviewFreeProduct:\n",IsVineReviewFreeProduct,"\n" print reviewText print "------------------------------------------------------------" productInfo.append([rankIndexValue, name , reviewerID , TotalReviews , HelpfulVotes , crNumPercentHelpful , crNumFanVoters,productName,price,productLink,starReview,reviewTime,isVerifiedPurchase,IsVineReviewFreeProduct,reviewText]) except: print 'failed this product section' #productReviewInfo=zip(productInfo,reviewInfo) #break return productInfo
return None def Load_Video( url ): print "Load_Video=" + url try: response = urllib2.urlopen(url) html = response.read() except urllib2.HTTPError, e: html = e.fp.read() pass soup = BeautifulSoup( html ) sourceVideos = [] # Handle href tags for a in soup.findAll('a', href=True): if a['href'].find("youtu.be") != -1: sourceVideos.append( a['href'].split()[0] ) if a['href'].find("youtube") != -1: sourceVideos.append( a['href'].split()[0] ) if a['href'].find("dailymotion") != -1: sourceVideos.append( a['href'].split()[0] ) if a['href'].find("tamildbox") != -1: src = a['href'].split()[0] print "tamildbox", src resp = net.http_GET( src ) dbox = resp.content sourceVideos += re.compile( '<iframe(.+?)>').findall( dbox )
#lists for mangas. urlChapters = [] # list of urls. urlChapter = [] # url of current chapter. urlImage = [] # list of images. request = urllib2.Request(target_url+'/'+ target_info, headers = headers) response = urllib2.urlopen(request) doc = response.read() #print doc ### prints the source of the page. soupDoc = BeautifulSoup(doc) # soup to parse the html. #soup = soup.prettify() # resource from http://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautiful-soup #'a' for all the items, then href for the links. links = soupDoc.findAll('a') #making a list of all Links on manga. #print links for link in links: #print link urlChapters.append(target_url+link['href']) #href - relates the links (look at page source) def getChapter(url): ''' A method to get all the usable links to later parse only the images. ''' print url chapterurl = url #the url of the chapter. reqChapter = urllib2.Request(chapterurl,headers = headers)#requesting. responseChapter = urllib2.urlopen(reqChapter) #returns the request.
def random_quote(jenni, cat): if cat is not None: if cat not in CATS: jenni.say("I don't know that category, please select from one of: {0}".format(', '.join(CATS))) return else: cat = CATS[randrange(len(CATS))] page_title = page_id = None # First drill down to the lowest category while(True): try: cat_url = BASE_URL + SUBCATS % cat content = json.loads(urllib.urlopen(cat_url).read()) cat_members = content["query"]["categorymembers"] # Select at random random_member = choose_random_member(cat_members) if random_member is None: jenni.say("An error occurred fetching a subcategory") return if random_member["type"] == "subcat": cat = random_member["title"] else: page_title = random_member["title"] page_id = random_member["pageid"] break except Exception as e: jenni.say("An error occurred fetching a quote: {0}".format(e)) return # Next select a random quote from the page try: page_url = BASE_URL + SECTIONS % page_id content = json.loads(urllib.urlopen(page_url).read()) sections = content["parse"]["sections"] quote = None num_tries = 0 while quote == None and num_tries < MAX_TRIES: section = choose_random_section(sections) if section is None: jenni.say("We accidentally chose a page with no quotes, sorry about that!") return section_index = randrange(len(sections)) + 1 section_url = BASE_URL + SECTION % (page_id, section_index) content = json.loads(urllib.urlopen(section_url).read()) section_title = content["parse"]["title"] html = Soup(content["parse"]["text"]["*"]) all_quotes = [] for ul in html.findAll('ul'): for li in ul.findAll('li'): all_quotes.append(li.text) for dd in html.findAll('dd'): all_quotes.append(dd.text.replace("<b>","").replace("</b>","")) len_all_quotes = len(all_quotes) if len_all_quotes == 0: num_tries += 1 else: quote = all_quotes[randrange(len_all_quotes)] if quote is None: jenni.say("We accidentally chose a section of a page with no quotes, sorry about that!") return jenni.say("{0}: {1}".format(section_title, quote.encode('utf-8'))) except Exception as e: jenni.say("An error occurred fetching a quote: {0}".format(e)) return
threadurl = 'http://www.okcupid.com/messages?readmsg=true&threadid=' + threadid + '&folder=2' threadreq = urllib2.build_opener() threadreq.addheaders.append(OkCupid.cookietuple) returnedthreadpage = threadreq.open(threadurl) threadsoup = BeautifulSoup(returnedthreadpage.read()) threadmsgs = threadsoup.findAll('div', {'class': 'message'}) username = threadsoup.find('a', { 'class': 'buddyname' }).text.encode("utf-8") receiverid = threadsoup.find( 'input', {'name': 'receiverid'})['value'].encode("utf-8") if len(threadmsgs) == 2: deletethread(threadid, username, receiverid) if __name__ == '__main__': pagecount = 1 while True: request = urllib2.build_opener() request.addheaders.append(OkCupid.cookietuple) sentmessagespage = request.open( 'http://www.okcupid.com/messages?low=' + str(pagecount) + '&folder=2') soup = BeautifulSoup(sentmessagespage.read()) msghtml = soup.findAll('li', {'class': re.compile('^readMessage')}) if len(msghtml) == 0: break for thread in msghtml: checkthread(thread['id'].encode('utf-8').replace('message_', '')) pagecount += 30
#just checking my urls - Accidentally created infinite loop. funtimes. #print baseurl3 # grab lyrics lyrics = soup2.findAll("div",{"class":"body_lyr"}) # No lyrics? stop program! if(len(lyrics) == 0): exit; # for each article... for entry in lyrics: mc2 = str(entry) # find the name of the product mosoup = BeautifulSoup(mc2) #remove comments & all other stuff comments = mosoup.findAll(text=lambda text:isinstance(text, Comment)) [comment.extract() for comment in comments] for script in mosoup("script"): mosoup.script.extract() for style in mosoup("style"): mosoup.style.extract() for iframe in mosoup("iframe"): mosoup.iframe.extract() for h4 in mosoup("h4"): mosoup.h4.extract() for h5 in mosoup("h5"): mosoup.h5.extract() for h2 in mosoup("h2"): mosoup.h2.extract() for a in mosoup("a"):
soup = BeautifulSoup(response)
print soup.html.head.title.string


class Person:
    def __init__(self, vorname, nachname, email, city):
        self.vorname = vorname
        self.nachname = nachname
        self.email = email
        self.city = city


persons = []

for link in soup.findAll("a"):
    if link.string == "See full profile":
        person_url = "https://scrapebook22.appspot.com" + link["href"]
        person_html = urlopen(person_url).read()
        person_soup = BeautifulSoup(person_html)

        name = person_soup.find("div", attrs={"class": "col-md-8"}).h1.string
        email = person_soup.find("span", attrs={"class": "email"}).string
        city = person_soup.find("span", attrs={"data-city": True}).string

        vorname, nachname = name.split(" ")
        person = Person(vorname=vorname, nachname=nachname, email=email, city=city)
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen

url = 'https://scrapebook22.appspot.com/'
response = urlopen(url).read()
soup = BeautifulSoup(response)

csv_file = open("email_list.csv", "w")

for link in soup.findAll("a"):
    if link.string == "See full profile":
        link_url = "https://scrapebook22.appspot.com/" + link["href"]
        link_html = urlopen(link_url).read()
        link_soup = BeautifulSoup(link_html)

        name = link_soup.findAll("h1")[1].string
        email = link_soup.find("span", attrs={"class": "email"}).string
        city = link_soup.find("span", attrs={"data-city": True}).string

        csv_file.write(name + "," + email + "," + city + "\n")
def downloadFirstAlbumImage(albumUrl, localFileName):
    albumSource = requests.get(albumUrl).text
    soup = BeautifulSoup(albumSource)
    matches = soup.findAll('link', rel='image_src')
    downloadImage(matches[0]['href'], localFileName)
def HTML_TO_TEXT(html):
    soup = BeautifulSoup(html)
    text_parts = soup.findAll(text=True)
    text = ''.join(text_parts)
    return text
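# A quick usage sketch for HTML_TO_TEXT above (added for illustration, not from
# the original source): it simply concatenates every text node BeautifulSoup 3
# finds, dropping the tags. The markup string is invented.
from BeautifulSoup import BeautifulSoup  # assumed available, as in the surrounding snippets

print HTML_TO_TEXT("<p>Hello <b>world</b>!</p>")  # -> Hello world!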
import re, os

report_date = date(2008, 12, 31)
crawl_date = date(2009, 3, 17)
nonnumeric = re.compile('[^0-9]')

for institution in Institution.objects.exclude(ots_number=''):
    if institution.institutionassethistory_set.all().count() == 0 and institution.transaction_set.all().count() > 0:
        url = 'http://www.ots.treas.gov/?p=InstitutionSearch&hid=%s' % (institution.ots_number)
        data = urllib.urlopen(url)
        soup = BeautifulSoup(data.read())
        print institution.ots_number
        institutionProfile = soup.findAll("table", attrs={"class": "institutionProfile"})
        if len(institutionProfile) > 0:
            assets = nonnumeric.sub('', institutionProfile[0].findAll("td")[4].string.strip())
            print assets
            InstitutionAssetHistory.objects.create(institution=institution, report_date=report_date,
                                                   crawl_date=crawl_date, total_deposits=None,
                                                   total_assets=assets)
import urllib2
import re
from BeautifulSoup import BeautifulSoup
from htmltreediff import diff
from kitchen.text.converters import to_unicode

f1 = open('travel.html', 'rw')
v1 = f1.read()

# def scrape(url=None, keywords=[], frequency=None, email=None):

# User inputs URL
# page = urllib2.urlopen("http://labor.ny.gov/app/warn/")
page = urllib2.urlopen("http://travel.state.gov/travel/cis_pa_tw/tw/tw_1764.html")

# Scrape URL, all of it
soup = BeautifulSoup(page)

# Find keywords
# warns_nyc = soup.findAll(text=re.compile("New York City"))
warns_travel = soup.findAll(text=re.compile("Lebanon"))

print diff(to_unicode(v1), to_unicode(page), pretty=True)

f1.close()
print ii doc = db[id] if doc.has_key('type') and doc['type'] == 'decision' and doc.has_key( '_attachments'): atts = doc['_attachments'] hts = atts.keys() at = db.get_attachment(id, hts[0]) s = at.read() soup = BeautifulSoup(s) court_found = False # Search first in the "court" class courts_html = br.sub( ' ', str(" ".join([str(ss) for ss in soup.findAll("p", "court")]))) # Standard court = cir.findall(courts_html) if len(court) > 0: court_found = True court_names[court[0].lower()] = 1 # Customs & Patent if not court_found: court = cus.findall(courts_html) if len(court) > 0: court_found = True court_names[court[0].lower()] = 1 # Claims
def find_entries(): page = 1 total_entry_count = 0 entries = [] while True: print " ---> Found %s entries so far. Now on page: %s" % (len(entries), page) knight_url = "http://newschallenge.tumblr.com/page/%s" % (page) html = requests.get(knight_url).content soup = BeautifulSoup(html) postboxes = soup.findAll("div", "postbox") # Done if only sticky entry is left. if len(postboxes) <= 1: break page += 1 # 15 entries per page, plus a sticky throwaway entry for entry in postboxes: if 'stickyPost' in entry.get('class'): continue total_entry_count += 1 likes = entry.find("", "home-likes") if likes and likes.text: likes = int(likes.text) else: likes = 0 comments = entry.find("", "home-comments") if comments and comments.text: comments = int(comments.text) else: comments = 0 title = entry.find("h2") if title: title = title.text url = entry.find('a', "home-view") if url: url = url.get('href') # Only record active entries if comments or likes: entries.append({ 'likes': likes, 'comments': comments, 'title': title, 'url': url, }) # time.sleep(random.randint(0, 2)) entries.sort(key=lambda e: e['comments'] + e['likes']) entries.reverse() active_entry_count = len(entries) found_entries = [] winner_count = 0 for i, entry in enumerate(entries): is_winner = entry['url'] in winners if is_winner: winner_count += 1 print " * %s#%s: %s likes - [%s](%s)%s" % ( "**" if is_winner else "", i + 1, entry['likes'], entry['title'], entry['url'], "**" if is_winner else "") found_entries.append(entry) print " ***> Found %s active entries among %s total applications with %s/%s winners." % ( active_entry_count, total_entry_count, winner_count, len(winners)) return found_entries
#print headers # get all test cases resp, content = html.request(TEST_CASE_VO_URL, "GET", headers=headers) #pp(content) tcs = content.split('Data/Test/') for tc in tcs: bs = BeautifulSoup(tc) category = bs.find(name='attribute',attrs={'name':'Category.Name'}) if category: # Only pick Automated test cases if category.text == 'Automated': tcid = tc.split('" id=')[0] att_tokens = bs.findAll('attribute') attrs_dict = {} for att_token in att_tokens: attrs_dict[att_token['name']] = \ remove_html_tags(html_decoder(att_token.text)) print '{name}: "{text}"'.format(name=att_token['name'], text=attrs_dict[att_token['name']]) # Execute test steps steps = attrs_dict['Steps'].split('\n') tc_passed = True tc_actual_output = '' for step in steps: if step: print step status, output = commands.getstatusoutput(step)
'mediaset extra':'http://www.staseraintv.com/programmi_stasera_mediaset_extra.html', 'sportitalia':'http://www.staseraintv.com/programmi_stasera_sportitalia.html', 'cielo':'http://www.staseraintv.com/programmi_stasera_cielo.html', 'italia 1':'http://www.staseraintv.com/programmi_stasera_italia1.html', 'italia 2':'http://www.staseraintv.com/programmi_stasera_italia2.html', 'mtv':'http://www.staseraintv.com/programmi_stasera_mtv.html', 'la 7':'http://www.staseraintv.com/programmi_stasera_la7.html', 'la 7D':'http://www.staseraintv.com/programmi_stasera_la7d.html' } day = datetime.now(timezone('Europe/Rome')).strftime('%Y-%m-%d') sourcedata = 'italian_tv_schedules' for tv in television.keys(): html = scraperwiki.scrape(television[tv]) soup = BeautifulSoup(html) tds = soup.findAll('table')[4].findAll('tr')[0].findAll('td') schedule_table = tds[len(tds)-1].findAll('small') schedule = schedule_table[len(schedule_table)-1] for s in schedule: v = str(s).replace('\r\n','') if (v.find(' - ') > 1): info = v.split(' - ') if (len(info) > 2): what = "" for i in range(len(info)): if (i > 1): what += " - " + info[i] else: if (i == 1): what += info[i] else:
month['04'] = 'Apr' month['05'] = 'May' month['06'] = 'Jun' month['07'] = 'Jul' month['08'] = 'Aug' month['09'] = 'Sep' month['10'] = 'Oct' month['11'] = 'Nov' month['12'] = 'Dec' for j in range(len(html)): out = urlopen(html[j]).read() soup = BeautifulSoup(out) res = soup.findAll('a') fr= [] date=[] for k in range(len(res)): if res[k].has_key('href'): ab = res[k]['href'] ab = ab.strip('..') ba = re.findall('\d\d\d\d\d\d/\d\d\d\d\d\d[a-z]\.html|\d\d\d\d\d\d/\d\d\d\d\d\d\.html', str(ab)) if len(ba)>0 : fr.append(ab.encode('UTF-8')) att = ab emp = re.findall('\d\d\d\d\d\d[a-z]\.html|\d\d\d\d\d\d\.html', str(att)) almost = re.sub('\W', '', emp[0].strip('.html')) almost = almost.strip('a').strip('b').strip('c').strip('d').strip('e').strip('f') mons = month[almost[0:2]] day = almost[2:4]
def fetch_youtube(self, address): username = None channel_id = None list_id = None if 'gdata.youtube.com' in address: try: username_groups = re.search('gdata.youtube.com/feeds/\w+/users/(\w+)/', address) if not username_groups: return username = username_groups.group(1) except IndexError: return elif 'youtube.com/feeds/videos.xml?user='******'user'][0] except IndexError: return elif 'youtube.com/feeds/videos.xml?channel_id=' in address: try: channel_id = urlparse.parse_qs(urlparse.urlparse(address).query)['channel_id'][0] except (IndexError, KeyError): return elif 'youtube.com/playlist' in address: try: list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['list'][0] except IndexError: return elif 'youtube.com/feeds/videos.xml?playlist_id' in address: try: list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['playlist_id'][0] except IndexError: return if channel_id: video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id) channel_json = requests.get("https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s" % (channel_id, settings.YOUTUBE_API_KEY)) channel = json.decode(channel_json.content) try: username = channel['items'][0]['snippet']['title'] description = channel['items'][0]['snippet']['description'] except (IndexError, KeyError): return elif list_id: playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" % (list_id, settings.YOUTUBE_API_KEY)) playlist = json.decode(playlist_json.content) try: username = playlist['items'][0]['snippet']['title'] description = playlist['items'][0]['snippet']['description'] except (IndexError, KeyError): return channel_url = "https://www.youtube.com/playlist?list=%s" % list_id elif username: video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username) description = "YouTube videos uploaded by %s" % username else: return if list_id: playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" % (list_id, settings.YOUTUBE_API_KEY)) playlist = json.decode(playlist_json.content) try: video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']] except (IndexError, KeyError): return else: if video_ids_xml.status_code != 200: return video_ids_soup = BeautifulSoup(video_ids_xml.content) channel_url = video_ids_soup.find('author').find('uri').getText() video_ids = [] for video_id in video_ids_soup.findAll('yt:videoid'): video_ids.append(video_id.getText()) videos_json = requests.get("https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" % (','.join(video_ids), settings.YOUTUBE_API_KEY)) videos = json.decode(videos_json.content) if 'error' in videos: logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos)) return data = {} data['title'] = ("%s's YouTube Videos" % username if 'Uploads' not in username else username) data['link'] = channel_url data['description'] = description data['lastBuildDate'] = datetime.datetime.utcnow() data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL data['docs'] = None data['feed_url'] = address rss = feedgenerator.Atom1Feed(**data) for video in videos['items']: thumbnail = video['snippet']['thumbnails'].get('maxres') if not thumbnail: thumbnail = video['snippet']['thumbnails'].get('high') if not thumbnail: thumbnail = video['snippet']['thumbnails'].get('medium') duration_sec = 
isodate.parse_duration(video['contentDetails']['duration']).seconds if duration_sec >= 3600: hours = (duration_sec / 3600) minutes = (duration_sec - (hours*3600)) / 60 seconds = duration_sec - (hours*3600) - (minutes*60) duration = "%s:%s:%s" % (hours, '{0:02d}'.format(minutes), '{0:02d}'.format(seconds)) else: minutes = duration_sec / 60 seconds = duration_sec - (minutes*60) duration = "%s:%s" % ('{0:02d}'.format(minutes), '{0:02d}'.format(seconds)) content = """<div class="NB-youtube-player"><iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe></div> <div class="NB-youtube-stats"><small> <b>From:</b> <a href="%s">%s</a><br /> <b>Duration:</b> %s<br /> </small></div><hr> <div class="NB-youtube-description">%s</div> <img src="%s" style="display:none" />""" % ( ("https://www.youtube.com/embed/" + video['id']), channel_url, username, duration, linkify(linebreaks(video['snippet']['description'])), thumbnail['url'] if thumbnail else "", ) link = "http://www.youtube.com/watch?v=%s" % video['id'] story_data = { 'title': video['snippet']['title'], 'link': link, 'description': content, 'author_name': username, 'categories': [], 'unique_id': "tag:youtube.com,2008:video:%s" % video['id'], 'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']), } rss.add_item(**story_data) return rss.writeString('utf-8')
#print Fore.YELLOW + dd, Fore.WHITE #c[1].show() home = os.path.expanduser("~") local = home + "/Downloads/TEMP/"+filename c[1].write_image(local, format="svg", width=1024, height = 768) comment = "[[User_talk:Fæ/DrugStats|DrugStats]] chart for {} {}".format(title, date) pywikibot.setAction(comment) if len(sys.argv)<2: up(local, filename, dd, comment, True) remove(local) return True topurl = "https://clincalc.com/DrugStats/Top300Drugs.aspx" html = urllib2.urlopen(topurl).read() soup = BeautifulSoup(html) drugs = soup.findAll('a', href=re.compile('Drugs/.*')) print Fore.GREEN + NOTICE print '*'*80 print Fore.CYAN, soup.find('meta', {'name':'description'})['content'] print Fore.CYAN, "Drugs found", len(drugs), Fore.WHITE count = 0 '''for drug in drugs: count += 1 upthisdrug(drug)''' chars="abcdefghijklmnopqrstuvwxyz";charx=[];charxx=[] for i in range(0, len(chars)): charx.append(chars[i]) for a in charx: for b in charx: for c in charx:
def get_simbad_literature_refs(self, srcid_class_match_dict): """ Query and retrieve SIMBAD literature references for Simbad matched ASAS sources. """ import urllib from BeautifulSoup import BeautifulSoup, Comment litrefs_init_fpath = '/home/dstarr/scratch/determine_simbad__orig/src_litrefs.pkl' if os.path.exists(litrefs_init_fpath): fp = open(litrefs_init_fpath, 'rb') src_litrefs = cPickle.load(fp) fp.close() else: src_litrefs = {} srcid_list = srcid_class_match_dict.keys() srcid_list.sort() n_srcid = len(srcid_list) for i, src_id in enumerate(srcid_list): src_dict = srcid_class_match_dict[src_id] if src_dict['main_id'].count(' ') > 0: ### Source names return random literature results if ' ' is not replaced with '+' src_litrefs[src_id] = {} else: ### This assumes that src_litrefs[src_id] has been filled previously continue # we skip re-retrieving this source. src_name = src_dict['main_id'].replace(' ', '+') #url_str_new = "http://simbad.u-strasbg.fr/simbad/sim-id?Ident=%s&NbIdent=1&Radius=2&Radius.unit=arcmin&submit=submit+id" % (src_name) url_str = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=AST&db_key=PRE&qform=AST&arxiv_sel=astro-ph&arxiv_sel=cond-mat&arxiv_sel=cs&arxiv_sel=gr-qc&arxiv_sel=hep-ex&arxiv_sel=hep-lat&arxiv_sel=hep-ph&arxiv_sel=hep-th&arxiv_sel=math&arxiv_sel=math-ph&arxiv_sel=nlin&arxiv_sel=nucl-ex&arxiv_sel=nucl-th&arxiv_sel=physics&arxiv_sel=quant-ph&arxiv_sel=q-bio&sim_query=YES&ned_query=YES&adsobj_query=YES&obj_req=YES&aut_logic=OR&obj_logic=OR&author=&object=%s&start_mon=&start_year=&end_mon=&end_year=&ttl_logic=OR&title=&txt_logic=OR&text=&nr_to_return=200&start_nr=1&jou_pick=ALL&ref_stems=&data_and=ALL&group_and=ALL&start_entry_day=&start_entry_mon=&start_entry_year=&end_entry_day=&end_entry_mon=&end_entry_year=&min_score=&sort=SCORE&data_type=SHORT&aut_syn=YES&ttl_syn=YES&txt_syn=YES&aut_wt=1.0&obj_wt=1.0&ttl_wt=0.3&txt_wt=3.0&aut_wgt=YES&obj_wgt=YES&ttl_wgt=YES&txt_wgt=YES&ttl_sco=YES&txt_sco=YES&version=1" % ( src_name) #(src_dict['main_id']) f_url = urllib.urlopen(url_str) webpage_str = f_url.read() f_url.close() soup = BeautifulSoup(webpage_str) comments = soup.findAll( text=lambda text: isinstance(text, Comment)) [comment.extract() for comment in comments] #print soup.html.body.form.find('table') #print '------------' #print soup.html.body.form.findAll('table')[1].table.tbody.findAll('tr') #soup.html.body.form.findAll('table')[1].extract() #bib_rows = soup.html.body.form.fetch('table')[1].fetch('tr') #print soup try: bib_rows = soup.html.body.form('table', limit=2)[1]('tr') print 'parsed:', i, n_srcid, src_id, src_dict['main_id'] except: # likely no results returned #print 'len(soup.html.body.form.table):', len(soup.html.body.form.table) print 'skip: ', i, n_srcid, src_id, src_dict['main_id'] continue for r in bib_rows: for td in r('td'): x = td.input if x == None: continue bibcode = x['value'] abstract_url = td.a['href'] # NOTE: I could probably extract some author names, title src_litrefs[src_id][bibcode] = abstract_url #print bibcode, abstract_url #import pdb; pdb.set_trace() #print #fp = open('/tmp/124', 'w') #fp.write(webpage_str) #fp.close() #import pdb; pdb.set_trace() #print #elemtree = ElementTree.fromstring(webpage_str) #xmld_data = xmldict.ConvertXmlToDict(elemtree) #b = xmld_data['HTML']['body']['form'] if (i % 500) == 0: fp = open('/tmp/src_litrefs_%d.pkl' % (i), 'wb') cPickle.dump(src_litrefs, fp, 1) # ,1) means a binary pkl is used. 
fp.close() import pdb pdb.set_trace() print fp = open('/tmp/src_litrefs.pkl', 'wb') cPickle.dump(src_litrefs, fp, 1) # ,1) means a binary pkl is used. fp.close()
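# The Comment-stripping idiom used above (and again in get_simbad_abstracts
# below) can be pulled out into a small helper. A minimal sketch, assuming
# only BeautifulSoup 3:
from BeautifulSoup import BeautifulSoup, Comment

def strip_html_comments(html):
    """Parse html and return a soup with all <!-- ... --> comment nodes removed."""
    soup = BeautifulSoup(html)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup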
from BeautifulSoup import BeautifulSoup from urllib2 import urlopen import random url = "https://quotes.yourdictionary.com/theme/marriage/" #the trailing slash is required response = urlopen(url).read() soup = BeautifulSoup(response) quotes = [] for quote in soup.findAll("p", attrs={"class": "quoteContent"}): #find the quotes with class quoteContent quotes.append(quote.string) # append the quote strings to the quotes list #print len(quotes) available_quotes = [] while len(available_quotes) < 5: # only 5 random quotes should be printed random_quote_index = random.randint(0, len(quotes) - 1) # random_quote_index is an int; 29 quotes, counting starts at 0 so the max index is range - 1 random_quote = quotes[random_quote_index] # new variable = the quote at a random position in the quotes list if random_quote_index not in available_quotes: # how to avoid duplicating the same items from the list! available_quotes.append(random_quote_index) # append only distinct values print "- " + random_quote
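# The while-loop above avoids duplicates by tracking indices; random.sample
# gives the same "up to 5 distinct quotes" effect in one call. A minimal
# sketch, assuming the quotes list built above:
import random

def pick_quotes(quotes, n=5):
    """Return up to n distinct quotes, in random order."""
    return random.sample(quotes, min(n, len(quotes)))

# for q in pick_quotes(quotes):
#     print "- " + q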
class ScholarParser(): """ ScholarParser can parse HTML document strings obtained from Google Scholar. It invokes the handle_article() callback on each article that was parsed successfully. """ SCHOLAR_SITE = 'http://scholar.google.com' def __init__(self, site=None): self.soup = None self.article = None self.site = site or self.SCHOLAR_SITE def handle_article(self, art): """ In this base class, the callback does nothing. """ def parse(self, html): """ This method initiates parsing of HTML content. """ self.soup = BeautifulSoup(html) for div in self.soup.findAll(ScholarParser._tag_checker): self._parse_article(div) def _parse_article(self, div): self.article = Article() for tag in div: if not hasattr(tag, 'name'): continue if tag.name == 'div' and tag.get('class') == 'gs_rt' and \ tag.h3 and tag.h3.a: self.article['title'] = ''.join(tag.h3.a.findAll(text=True)) self.article['url'] = self._path2url(tag.h3.a['href']) if tag.name == 'font': for tag2 in tag: if not hasattr(tag2, 'name'): continue if tag2.name == 'span' and tag2.get('class') == 'gs_fl': self._parse_links(tag2) if self.article['title']: self.handle_article(self.article) def _parse_links(self, span): for tag in span: if not hasattr(tag, 'name'): continue if tag.name != 'a' or tag.get('href') == None: continue if tag.get('href').startswith('/scholar?cites'): if hasattr(tag, 'string') and tag.string.startswith('Cited by'): self.article['num_citations'] = \ self._as_int(tag.string.split()[-1]) self.article['url_citations'] = self._path2url(tag.get('href')) if tag.get('href').startswith('/scholar?cluster'): if hasattr(tag, 'string') and tag.string.startswith('All '): self.article['num_versions'] = \ self._as_int(tag.string.split()[1]) self.article['url_versions'] = self._path2url(tag.get('href')) @staticmethod def _tag_checker(tag): if tag.name == 'div' and tag.get('class') == 'gs_r': return True return False def _as_int(self, obj): try: return int(obj) except ValueError: return None def _path2url(self, path): if path.startswith('http://'): return path if not path.startswith('/'): path = '/' + path return self.site + path
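# ScholarParser.handle_article() is a deliberate no-op hook; a minimal usage
# sketch, assuming an html string fetched from Google Scholar elsewhere and
# the Article objects the parser builds:
class PrintingScholarParser(ScholarParser):
    def handle_article(self, art):
        # called once per successfully parsed search result
        print art['title'], art['url']

# parser = PrintingScholarParser()
# parser.parse(html)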
# Blank Python school_list = ['alabama-a--and--m-university','university-of-alabama----birmingham','university-of-alabama----huntsville','auburn-university','university-of-north-alabama','samford-university','University-of-South-Alabama','columbia-southern-university','alabama-state-university','auburn-university----montgomery','jacksonville-state-university','university-of-montevallo','oakwood-university','university-of-alabama','birmingham--southern-college','troy-university','tuskegee-university','university-of-alaska-anchorage','university-of-alaska-fairbanks','grand-canyon-university','university-of-advancing-technology','arizona-state-university','university-of-arizona','embry--riddle-aeronautical-university----prescott','northern-arizona-university','university-of-phoenix----phoenix--hohokam','university-of-phoenix----southern-arizona','university-of-arkansas-at-little-rock','university-of-arkansas','harding-university','arkansas-state-university','philander-smith-college','arkansas-tech-university','hendrix-college','university-of-the-ozarks','california-baptist-university','california-institute-of-technology','california-lutheran-university','california-state-university----stanislaus','california-state-university----san-bernardino','california-state-university----chico','california-state-university----dominguez-hills','california-state-university----fresno','california-state-university----fullerton','california-state-university----northridge','university-of-california----davis','university-of-california----irvine','university-of-california----los-angeles','university-of-california----santa-barbara','university-of-la-verne','loyola-marymount-university','mills-college','occidental-college','university-of-the-pacific','pepperdine-university','pomona-college','university-of-redlands','san-diego-state-university','university-of-san-francisco','san-jose-state-university','scripps-college','sonoma-state-university','vanguard-university-of-southern-california','university-of-southern-california','california-state-university----san-marcos','california-polytechnic-state-university----san-luis-obispo','california-state-university----east-bay','california-state-university----sacramento','university-of-california----riverside','university-of-california----san-diego','university-of-california----santa-cruz','chapman-university','claremont-mckenna-college','harvey-mudd-college','humboldt-state-university','pitzer-college','university-of-san-diego','santa-clara-university','stanford-university','azusa-pacific-university','biola-university','california-state-university----bakersfield','california-state-polytechnic-university----pomona','california-state-university----long-beach','california-state-university----los-angeles','university-of-california----berkeley','pacific-union-college','point-loma-nazarene-university','san-francisco-state-university','whittier-college','california-state-university----monterey-bay','university-of-colorado-denver','university-of-colorado-at-colorado-springs','university-of-colorado----boulder','colorado-school-of-mines','colorado-state-university','university-of-denver','colorado-mesa-university','metropolitan-state-college-of-denver','colorado-state-university----pueblo','colorado-college','university-of-northern-colorado','regis-university','university-of-bridgeport','fairfield-university','university-of-hartford','university-of-new-haven','quinnipiac-university','sacred-heart-university','southern-connecticut-state-university','trinity-college','western-connecticu
t-state-university','yale-university','connecticut-college','university-of-connecticut','wesleyan-university','central-connecticut-state-university','delaware-state-university','university-of-delaware','american-university','catholic-university-of-america','george-washington-university','howard-university','georgetown-university','university-of-the-district-of-columbia','barry-university','bethune--cookman-university','university-of-central-florida','eckerd-college','embry--riddle-aeronautical-university----daytona-beach','florida-a-and-m-university','florida-international-university','florida-southern-college','florida-state-university','university-of-miami','university-of-north-florida','nova-southeastern-university','rollins-college','university-of-south-florida','stetson-university','university-of-tampa','new-college-of-florida','jacksonville-university','saint-leo-university','southeastern-university----florida','university-of-west-florida','ave-maria-university','south-university----tampa','florida-atlantic-university','florida-institute-of-technology','university-of-florida','florida-gulf-coast-university','albany-state-university','augusta-state-university','columbus-state-university','emory-university','fort-valley-state-university','georgia-institute-of-technology','georgia-college--and--state-university','university-of-georgia','north-georgia-college--and--state-university','valdosta-state-university','georgia-gwinnett-college','agnes-scott-college','armstrong-atlantic-state-university','clark-atlanta-university','berry-college','covenant-college','georgia-southern-university','georgia-state-university','wesleyan-college','university-of-west-georgia','clayton-state-university','kennesaw-state-university','mercer-university','morehouse-college','oglethorpe-university','savannah-state-university','spelman-college','southern-polytechnic-state-university','university-of-hawaii-at-hilo','university-of-hawaii-at-manoa','hawaii-pacific-university','boise-state-university','idaho-state-university','brigham-young-university----idaho','university-of-idaho','augustana-college----rock-island','university-of-chicago','columbia-college-chicago','elmhurst-college','illinois-wesleyan-university','illinois-state-university','millikin-university','northwestern-university','northeastern-illinois-university','olivet-nazarene-university','southern-illinois-university-carbondale','southern-illinois-university-edwardsville','western-illinois-university','bradley-university','depaul-university','eastern-illinois-university','illinois-institute-of-technology','lewis-university','mckendree-university','north-central-college','northern-illinois-university','roosevelt-university','wheaton-college----illinois','aurora-university','chicago-state-university','university-of-illinois-at-chicago','university-of-illinois-at-urbana--champaign','loyola-university-chicago','anderson-university----indiana','butler-university','earlham-college','hanover-college','indiana-university--purdue-university----fort-wayne','university-of-indianapolis','university-of-southern-indiana','indiana-university','indiana-wesleyan-university','purdue-university----calumet','saint-marys-college','valparaiso-university','purdue-university','ball-state-university','indiana-state-university','depauw-university','indiana-university----south-bend','university-of-notre-dame','taylor-university','coe-college','drake-university','grinnell-college','university-of-iowa','dordt-college','luther-college','university-of-northern-iowa','cornell-colle
ge','iowa-state-university','kansas-state-university','emporia-state-university','university-of-kansas','wichita-state-university','bellarmine-university','berea-college','university-of-kentucky'] #print list[0] for thisSchool in school_list: #print thisSchool try: page = urllib2.urlopen("http://collegeprowler.com/" + thisSchool + "/diversity/student-polls/") soup = BeautifulSoup(page) table_five = soup.findAll("table")[4] politics_label = table_five.findAll("td")[1].string #label #print politics_label politics_value = table_five.findAll("td")[0].string #value print politics_value except (IndexError): print "Oops! Try again..." scraperwiki.sqlite.save
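# The scraper above ends with a bare "scraperwiki.sqlite.save", which never
# actually calls the datastore. A minimal sketch of the usual ScraperWiki
# save call, meant to be invoked inside the try block; the 'school' unique
# key and column names are assumptions, not taken from the original code:
import scraperwiki

def save_poll_row(school, label, value):
    scraperwiki.sqlite.save(unique_keys=['school'],
                            data={'school': school,
                                  'politics_label': label,
                                  'politics_value': value})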
def test_3_download_jenkins_job(self): mode_start_time=time.time() # Create destination directory destination_dir = 'Jenkins_Job_Files' destination_dir = os.path.join(os.path.dirname(os.path.abspath('.')), destination_dir) if os.path.exists(destination_dir): shutil.rmtree(destination_dir) os.mkdir(destination_dir) #Import BeautifulSoup try: from BeautifulSoup import BeautifulSoup except Exception as e: print_in_color(str(e), 'red') print_in_color('Execute "pip install beautifulsoup" to install it!', 'yellow') exit('Install beautifulsoup and rerun!') # Download logs response = urllib2.urlopen(artifact_url) html = response.read() parsed_url = urlparse.urlparse(artifact_url) base_url = parsed_url.scheme + '://' + parsed_url.netloc soup = BeautifulSoup(html) tar_gz_files = [] ir_logs_urls = [] tempest_log_url = None for link in soup.findAll('a'): if 'tempest-results' in link: tempest_results_url = urljoin(artifact_url, link.get('href')) tempest_response = urllib2.urlopen(tempest_results_url) html = tempest_response.read() soup = BeautifulSoup(html) for link in soup.findAll('a'): if str(link.get('href')).endswith('.html'): tempest_html = link.get('href') tempest_log_url = urljoin(artifact_url, 'tempest-results') + '/' + tempest_html break if str(link.get('href')).endswith('.tar.gz'): tar_gz_files.append(link) tar_link = urlparse.urljoin(artifact_url, link.get('href')) os.system('wget -P ' + destination_dir + ' ' + tar_link) if str(link.get('href')).endswith('.sh'): sh_page_link = urlparse.urljoin(artifact_url, link.get('href')) response = urllib2.urlopen(sh_page_link) html = response.read() soup = BeautifulSoup(html) for link in soup.findAll('a'): if str(link.get('href')).endswith('.log'): ir_logs_urls.append(sh_page_link + '/' + link.get('href')) # Download console.log console_log_url=artifact_url.strip().replace('artifact','consoleFull').strip('/') print console_log_url os.system('wget -P ' + destination_dir + ' ' + console_log_url) shutil.move(os.path.join(destination_dir, 'consoleFull'),os.path.join(destination_dir,'consoleFull.log')) # Download Infrared Logs .sh, files in .sh directory on Jenkins if len(ir_logs_urls)!=0: for url in ir_logs_urls: os.system('wget -P ' + destination_dir + ' ' + url) # Download tempest log (html #) if tempest_log_url!=None: os.system('wget -P ' + destination_dir + ' ' + tempest_log_url) shutil.move(os.path.join(destination_dir, tempest_html),os.path.join(destination_dir,tempest_html.replace('.html','.log'))) # Unzip all downloaded .tar.gz files for fil in os.listdir(os.path.abspath(destination_dir)): if fil.endswith('.tar.gz'): cmd = 'tar -zxvf ' + os.path.join(os.path.abspath(destination_dir), fil) + ' -C ' + os.path.abspath( destination_dir) + ' >/dev/null' + ';' + 'rm -rf ' + os.path.join( os.path.abspath(destination_dir), fil) print_in_color('Unzipping ' + fil + '...', 'bold') os.system(cmd) os.system('rm -rf '+fil) # Run LogTool analyzing print_in_color('\nStart analyzing downloaded OSP logs locally', 'bold') result_dir = 'Jenkins_Job_' + grep_string.replace(' ', '') if os.path.exists(os.path.abspath(result_dir)): shutil.rmtree(os.path.abspath(result_dir)) result_file = os.path.join(os.path.abspath(result_dir), 'LogTool_Result_' + grep_string.replace(' ', '') + '.log') print artifact_url print user_tart_time print undercloud_logs command = "python2 Extract_On_Node.py '" +user_tart_time+ "' " + os.path.abspath( destination_dir) + " '" + grep_string + "'" + ' ' + result_file # shutil.copytree(destination_dir, os.path.abspath(result_dir))
exec_command_line_command('cp -r ' + destination_dir + ' ' + os.path.abspath(result_dir)) print_in_color('\n --> ' + command, 'bold') com_result = exec_command_line_command(command) # print (com_result['CommandOutput']) end_time = time.time() if com_result['ReturnCode'] == 0: spec_print(['Completed!!!', 'You can find the result file + downloaded logs in:', 'Result Directory: ' + result_dir, 'Analyze logs execution time: ' + str(round(end_time - mode_start_time, 2)) + '[sec]'], 'green') else: spec_print(['Completed!!!', 'Result Directory: ' + result_dir, 'Analyze logs execution time: ' + str(round(end_time - mode_start_time, 2)) + '[sec]'], 'red') def test_4_create_final_report(self): print('\ntest_4_create_final_report') report_file_name = 'LogTool_Report.log' if report_file_name in os.listdir('.'): os.remove(report_file_name) report_data='' for key in workers_output: if 'Total_Number_Of_Errors:0' not in workers_output[key]: report_data+='\n'+key+' --> '+workers_output[key] if len(report_data)!=0: append_to_file(report_file_name,report_data+ '\n\nFor more details, check LogTool result files on your setup:' '\n'+os.path.abspath(result_dir))
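# The Jenkins test above shells out to wget for each artifact; a minimal
# sketch of the same download done with urllib2 (which the test already uses
# for page fetches), shown only as an alternative:
import os
import urllib2

def download_to(url, destination_dir):
    """Fetch url and write it under destination_dir, keeping the basename."""
    data = urllib2.urlopen(url).read()
    target = os.path.join(destination_dir, os.path.basename(url.rstrip('/')))
    with open(target, 'wb') as fp:
        fp.write(data)
    return target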
def get_simbad_abstracts(self, srcid_class_match_dict): """ Query and retrieve SIMBAD abstracts, using previously retrieved literature references for Simbad matched ASAS sources. """ import urllib from BeautifulSoup import BeautifulSoup, Comment fp = open('/tmp/src_litrefs.pkl', 'rb') src_litrefs = cPickle.load(fp) fp.close() abs_bibcodes_dirpath = '/home/obs/scratch/determine_simbad' abs_bibcodes = os.listdir(abs_bibcodes_dirpath) abstracts_pkl_dirpath = '/home/obs/scratch/determine_simbad_abstracts.pkl' if os.path.exists(abstracts_pkl_dirpath): fp = open(abstracts_pkl_dirpath, 'rb') abstracts_dict = cPickle.load(fp) fp.close() else: abstracts_dict = {} srcid_list = src_litrefs.keys() srcid_list.sort() for i, src_id in enumerate(srcid_list): src_bib_dict = src_litrefs[src_id] for bibcode, abstract_url in src_bib_dict.iteritems(): if abstracts_dict.has_key(bibcode): continue # skip since we parsed this already fpath = "%s/%s" % (abs_bibcodes_dirpath, bibcode.replace('/', '___')) # TODO: need to check that we have not parsed and place in dict if not bibcode in abs_bibcodes: f_url = urllib.urlopen(abstract_url) webpage_str = f_url.read() f_url.close() fp = open(fpath, 'w') fp.write(webpage_str) fp.close() else: fp = open(fpath) webpage_str = fp.read() fp.close() soup = BeautifulSoup(webpage_str) comments = soup.findAll( text=lambda text: isinstance(text, Comment)) [comment.extract() for comment in comments] #print soup.html.body('p', limit=2)[1]('table', limit=2)[1].prettify() #import pdb; pdb.set_trace() #print try: abstract_rows = soup.html.body('p', limit=2)[1]( 'table', limit=2)[1]('tr') except: print "skipping:", bibcode continue for r in abstract_rows: if 'Title:' in str(r('td')[0].getText()): title = r('td')[2].getText() # in UNICODE elif 'Authors:' in str(r('td')[0].getText()): authors = r('td')[2].getText() # in UNICODE elif 'Publication:' in str(r('td')[0].getText()): publication = r('td')[2].getText() # in UNICODE elif 'Publication Date:' in str(r('td')[0].getText()): publication_date = r('td')[2].getText() # in UNICODE elif 'Keywords:' in str(r('td')[0].getText()): keywords = r('td')[2].getText() # in UNICODE #print "title:%s \nauthors:%s \npub:%s \ndate:%s \nkeywords:%s\n" % (title, authors, publication, publication_date, keywords) print i, src_id, bibcode, title[:60] abstracts_dict[bibcode] = { 'title': title, 'authors': authors, 'publication': publication, 'pub_date': publication_date, 'keywords': keywords, } if os.path.exists(abstracts_pkl_dirpath): os.system('rm ' + abstracts_pkl_dirpath) fp = open(abstracts_pkl_dirpath, 'wb') cPickle.dump(abstracts_dict, fp, 1) fp.close() import pdb pdb.set_trace() print
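# get_simbad_abstracts() checkpoints its results to a pickle; a small sketch
# of reading that file back and looking up one bibcode. The default path is
# the one used above; the printed fields match the dict it stores:
import cPickle

def load_abstract(bibcode, pkl_path='/home/obs/scratch/determine_simbad_abstracts.pkl'):
    fp = open(pkl_path, 'rb')
    abstracts_dict = cPickle.load(fp)
    fp.close()
    entry = abstracts_dict.get(bibcode)
    if entry is not None:
        print entry['title'], entry['pub_date']
    return entry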
def test_email_diff_subtitles(self): initial_count = len(mail.outbox) # set a user who can receive notifications # make sure we have a different author, else he won't get notified author = User(username='******', email='*****@*****.**', notify_by_email=True, valid_email=True) author.save(send_email_confirmation=False) # bypass logic from hell author.valid_email = True author.save() # this is needed for the non_editor template check user2 = User(username='******', email='*****@*****.**', notify_by_email=True, valid_email=True) user2.save(send_email_confirmation=False) # bypass logic from hell user2.valid_email = True user2.save() # version is identical to the previous one video = Video.get_or_create_for_url( "http://wwww.example.com/video-diff.mp4")[0] video.followers.add(author) video.followers.add(user2) language = SubtitleLanguage(video=video, language_code='en') language.save() subs_data = [ [0, 1000, '1'], [1000, 2000, '2'], ] subtitles_1 = SubtitleSet.from_list('en', subs_data) old_version = language.add_version(subtitles=subtitles_1, author=author) # now we change the text on the second sub subs_data[1][2] = '2 changed' # add a regular sub subs_data.append([2000, 3000, 'new sub']) # add an unsynced sub subs_data.append([None, None, 'no sync']) subtitles_2 = SubtitleSet.from_list('en', subs_data) new_version = language.add_version(subtitles=subtitles_2) self.assertTrue(len(video.notification_list()) > 0) res = send_new_version_notification(new_version.pk) self.assertNotEqual(res, None) # we expect two emails, one is the new-edits-non-editor, and # the other for mail_notification.html self.assertEqual(len(mail.outbox), initial_count + 2) for email_number, email_msg in enumerate(mail.outbox): # make sure this is the right message self.assertIn("New edits to ", email_msg.subject) self.assertIn("video-diff.mp4", email_msg.subject) html = BeautifulSoup(email_msg.body) html_text = "".join(html.body(text=True)).replace("\n", "") if email_number == 0: # assert text and timing changes are correct self.assertIn('67% of the text', html_text) self.assertIn('33% of the timing was changed.', html_text) # find the listed text changes to make sure they match diff_table = html.findAll('table', attrs={'class': 'diffs'})[0] old_version_changes = [] new_version_changes = [] for i, node in enumerate(diff_table.findAll('td')): if i % 2 == 0: old_version_changes.append(node.text) else: new_version_changes.append(node.text) self.assertEqual(old_version_changes, [u'2', u'', u'']) self.assertEqual(new_version_changes, [ u'2 changed', u'new sub', u'no sync', ])
br.select_form(nr=0) # Input the info br.form['npa'] = phonenum[:3] br.form['nxx'] = phonenum[3:6] br.form['thoublock'] = phonenum[6] # Find Data br.submit() # Process Data html = br.response().read() soup = BeautifulSoup(html) # Remove HTML Tags text_parts = soup.findAll(text=True) text = ' '.join(text_parts) # Regex Applicable Data prov_name = re.findall('[0-9]+ [0-9]+ \w+ \w+ \w+.+PROV', text) # Convert List to String L = [str(x) for x in prov_name] s = string.join(L, ' ') # Display Server Feedback print "[ ] FoneFinder Server Feedback: " + s # Provider Selection Menu phoneHash = { "1": "Teleflip",
"&WebPageNr=1&WebAction=NewSearch" html0 = scraperwiki.scrape(starting_url) m = re.search("ListBody.csp\?[^\"]+", html0) if m: list_url = baseurl + m.group(0) else: import sys sys.exit(0) html = scraperwiki.scrape(list_url) #print html soup = BeautifulSoup(html) # use BeautifulSoup to get all <td> tags tds = soup.findAll("td", attrs={"class": re.compile("listitem(Odd|Even)")}) oldurl = "" for td in tds: if td.find("a"): url = unicode(td.find("a")["href"]) if url == oldurl: continue oldurl = url #print baseurl + url html2 = scraperwiki.scrape(baseurl + url.replace(" ", "%20")) #print html2 m = re.search("FullBBBody.csp\?[^\"]+", html2) if m: url2 = m.group(0) html3 = scraperwiki.scrape(baseurl + url2)
class HansardParser(object): """Base class for Hansard parsers""" def __init__(self, hansard, html): super(HansardParser, self).__init__() self.hansard = hansard for regex in STARTUP_RE: html = re.sub(regex[0], regex[1], html) self.soup = BeautifulSoup(html, convertEntities='html') # remove comments for t in self.soup.findAll(text=lambda x: isinstance(x, Comment)): t.extract() def parse(self): self.statements = [] self.statement_index = 0 def houseTime(self, number, ampm): ampm = ampm.replace('.', '') number = number.replace('.', ':') match = re.search(r'(\d+):(\d+)', number) if match: # "2:30 p.m." return datetime.datetime.strptime( "%s:%s %s" % (match.group(1), match.group(2), ampm), "%I:%M %p").time() else: # "2 p.m." return datetime.datetime.strptime("%s %s" % (number, ampm), "%I %p").time() def saveProceedingsStatement(self, text, t): text = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip())) if len(text): timestamp = t['timestamp'] if not isinstance(timestamp, datetime.datetime): # The older parser provides only datetime.time objects timestamp = datetime.datetime.combine(self.date, timestamp) statement = Statement(hansard=self.hansard, time=timestamp, text=text, sequence=self.statement_index, who='Proceedings') self.statement_index += 1 self.statements.append(statement) def saveStatement(self, t): def mcUp(match): return 'Mc' + match.group(1).upper() if t['topic']: # Question No. 139-- -> Question No. 139 t['topic'] = re.sub(r'\-+$', '', t['topic']) t['topic'] = re.sub(r"'S", "'s", t['topic']) t['topic'] = re.sub(r'Mc([a-z])', mcUp, t['topic']) if t.hasText(): if not t['member_title']: t['member_title'] = 'Proceedings' print "WARNING: No title for %s" % t.getText().encode( 'ascii', 'replace') timestamp = t['timestamp'] if not isinstance(timestamp, datetime.datetime): # The older parser provides only datetime.time objects timestamp = datetime.datetime.combine(self.date, timestamp) statement = Statement(hansard=self.hansard, heading=t['heading'], topic=t['topic'], time=timestamp, member=t['member'], politician=t['politician'], who=t['member_title'], text=t.getText(), sequence=self.statement_index, written_question=bool(t['written_question'])) if r_notamember.search(t['member_title'])\ and ('Speaker' in t['member_title'] or 'The Chair' in t['member_title']): statement.speaker = True self.statement_index += 1 self.statements.append(statement) if ENABLE_PRINT: print u"HEADING: %s" % t['heading'] print u"TOPIC: %s" % t['topic'] print u"MEMBER TITLE: %s" % t['member_title'] print u"MEMBER: %s" % t['member'] print u"TIME: %s" % t['timestamp'] print u"TEXT: %s" % t.getText() if ENABLE_READLINE: sys.stdin.readline() t.onward()
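# houseTime() above normalizes "p.m." to "pm" and "2.30" to "2:30" before
# calling strptime; a minimal standalone check of the two formats it accepts
# (pure datetime, no Hansard objects assumed):
import datetime

assert datetime.datetime.strptime("2:30 pm", "%I:%M %p").time() == datetime.time(14, 30)
assert datetime.datetime.strptime("2 pm", "%I %p").time() == datetime.time(14, 0)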