def whitespace(options):
    # clean events
    Event.objects.filter(source="whitespace").delete()

    soup = BeautifulSoup(urlopen("http://www.0x20.be/Main_Page").read())

    for event in soup.ul('li'):
        if event.text == 'More...':
            continue

        title = event.a.text
        url = "http://www.0x20.be" + event.a["href"]

        if "-" in event.b.text[:-1]:
            start, end = map(lambda x: parse(x.strip()), event.b.text[:-1].split("-"))
        else:
            start = parse(event.b.text[:-1])
            end = None

        location = event('a')[1].text

        Event.objects.create(
            title=title,
            source="whitespace",
            url=url,
            start=start,
            end=end,
            location=location.strip() if location else None
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "whitespace", location.encode("Utf-8"))
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page)

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
def getAvailabilityRank(table):
    try:
        #print "getting List of ATMs requires attention..."
        soup = BeautifulSoup(str(table))
        rows = soup.findAll('tr')
        numRows = getRowsNumber(table)
        numRowsHead = getRowsHeadNumber(table)
        arrBestBranchBri = []
        for a in range(2, numRows - 1):
            trs = BeautifulSoup(str(rows[a]))
            tdcells = trs.findAll("td")
            percentAvailBri = float(tdcells[17].getText())
            ukerName = cleanUpNamaUker(tdcells[0].getText())
            if percentAvailBri == 100.00:
                #arrBestBranch.append(ukerName+", "+jumlahATM)
                arrBestBranchBri.append(ukerName)
    except IndexError:
        arrBestBranchBri = getAvailabilityRank(table)
    return sorted(arrBestBranchBri)
def wolfplex(options):
    # clean events
    Event.objects.filter(source="wolfplex").delete()

    html_parser = HTMLParser()
    soup = BeautifulSoup(urlopen("http://www.wolfplex.org/wiki/Main_Page").read())
    events = soup.find("div", id="accueil-agenda").dl

    for date_info, event in zip(events('dt'), events('dd')[1::2]):
        if event.span:
            event.span.clear()

        title = html_parser.unescape(event.text)
        base_domain = "http://www.wolfplex.org" if not event.a["href"].startswith("http") else ""
        url = (base_domain + event.a["href"]) if event.a else "http://www.wolfplex.org"
        start = parse(date_info.span["title"])

        if "@" in title:
            title, location = title.split("@", 1)
        else:
            location = None

        Event.objects.create(
            title=title,
            source="wolfplex",
            url=url,
            start=start,
            location=location
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "wolfplex", location.encode("Utf-8") if location else "")
def find_external_urls(self, gbobject):
    """Find external urls in a gbobject"""
    soup = BeautifulSoup(gbobject.html_content)
    external_urls = [a['href'] for a in soup.findAll('a')
                     if self.is_external_url(
                         a['href'], self.ressources.site_url)]
    return external_urls
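# A minimal standalone sketch of the external-link check used above, with the method
# pulled out of its class. The sample HTML and "site_url" value are made up for the
# demo, and urlparse is the Python 2 module this BeautifulSoup-era code pairs with.
from urlparse import urlparse
from BeautifulSoup import BeautifulSoup

def is_external_url(url, site_url):
    # a link is external when it is absolute and points at another network location
    return bool(urlparse(url).netloc) and urlparse(url).netloc != urlparse(site_url).netloc

html = '<a href="http://example.org/x">ext</a> <a href="/local">int</a>'
soup = BeautifulSoup(html)
print [a['href'] for a in soup.findAll('a') if is_external_url(a['href'], "http://mysite.com")]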
def getRowsHeadNumber(table):
    # how do we determine how many rows are used as the table header?
    soup = BeautifulSoup(str(table))
    rows = soup.findAll('tr')
    numRows = len(table.findAll(lambda tag: tag.name == 'tr' and tag.findParent('table') == table))
    # initialise numRowsHead as the number of rows that contain header cells
    numRowsHead = 0
    # inspect every row one by one
    for i in range(0, numRows):
        # if a given row contains a <th> tag
        if rows[i].findAll('th'):
            # then that row index (plus one) becomes the header row count
            numRowsHead = i + 1
    # the function returns the number of rows used as the header
    return numRowsHead
def getLastPageNum(alamatURL):
    strHTML = fetchHTML(alamatURL)
    mysoup = BeautifulSoup(strHTML)
    arrURL = mysoup.findAll('tfoot')[0].findAll('tr')[0].findAll('a')
    maxPage = 0
    if arrURL:
        for i in range(0, len(arrURL)):
            lastPageNum = int(arrURL[i].get('href').split('/')[7].split('?')[0])
            if lastPageNum > maxPage:
                maxPage = lastPageNum
        lastPageNum = maxPage
    else:
        lastPageNum = 0
    print "last page number is:", lastPageNum
    return int(lastPageNum)
def crawl(self, url, q):
    """
    Crawls the main url looking for sub-urls.
    """
    print 'calling crawl with url', url
    s = requests.Session()
    num_urls = 0
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    trs = soup.findAll('tr')
    for tr in trs:
        tds = tr.findAll('td')
        if len(tds) == 6:
            title = tds[1].getText()
            link = tds[3].find('a')['href']
            item = {
                'main_page': title,
            }
            item['link'] = self.get_data_link(link, s)
            num_urls += self.crawl_again(item, q, s)
    print 'total urls crawled:', num_urls
def crawl_again(self, item, q, s):
    """
    Crawls the content page, looking for all urls in the same domain.
    """
    r = s.get(item['link'])
    soup = BeautifulSoup(r.text)
    main = soup.title.getText()
    urls = soup.findAll('a')
    chre = re.compile("(?<=chpt=)\d+")
    for url in urls:
        href = url['href']
        isChapt = chre.search(href)
        if isChapt is None:
            mySub = "NoChap"
        else:
            mySub = isChapt.group(0)
        if href.startswith('/'):
            link = domain + href
            q.enq({
                'main_page': main,
                'sub-page': mySub,
                'section': url.parent.parent.getText().lstrip(),
                'link': link
            })
    return len(urls)
def getpresentationdetails(sender, **kwargs):
    print "Pre Save!"
    #print sender
    model = kwargs['instance']
    # fetch the presentation url
    try:
        import urllib
        from BeautifulSoup import BeautifulSoup as BS
        html = urllib.urlopen(kwargs['instance'].url).read()
        bs = BS(html)
        # find the media url
        presurl = bs.find('link', rel='media:presentation')
        print "* Presentation: " + presurl['href']
        # and the thumbnail
        thumburl = bs.find('link', rel='image_src')
        print "* Thumbnail: " + thumburl['href']
        # and the author name
        creator = bs.find('meta', property='dc:creator')
        print "* Creator: " + creator['content']
        title = bs.find('meta', property="media:title")
        print "* Content: " + title['content']
    except Exception, e:
        raise e
def get_daily_specials(day=None):
    page = urlopen(URL)
    soup = BeautifulSoup(page)
    page.close()

    daily_specials = {
        "name": "Dolcetto",
        "specials": [],
        "streetaddress": "Kyrkogatan 8, Sundsvall",
        "dataurl": URL,
        "mapurl": "http://www.hitta.se/ViewDetailsPink.aspx?Vkiid=4uG7%252fiYMOcHQKtp0VSkMNw%253d%253d&Vkid=3215131"
    }

    if day is None:
        day = date.today().weekday()

    # No lunch on Saturday or Sunday
    if day == 5 or day == 6:
        return daily_specials

    day = [u"måndag", u"tisdag", u"onsdag", u"torsdag", u"fredag"][day]

    anchor = soup.find(lambda t: t.name == "h2" and t.text == "Lunchmeny")
    menu = filter(lambda x: isinstance(x, NavigableString), anchor.findNextSibling("p"))

    for i, v in enumerate(menu):
        if day == v.lower():
            daily_specials["specials"].append(menu[i+1])
            break

    return daily_specials
def get(self, regno):
    #self.response.headers['Content-Type'] = 'text/html'
    br = _mechanize.Browser()
    cj = cookielib.CookieJar()
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    n = 262
    while n <= 262:
        m = str(n).zfill(4)  # filling zeros for roll no like 001,002 etc.
        n = n + 1
        #self.response.write('11BEC')  # This is where roll no goes, for 09BCE just replace by 09BCE.
        #u=regno
        r = br.open('https://academics.vit.ac.in/parent/parent_login.asp')
        html = r.read()
        soup = BeautifulSoup(html)
        img = soup.find('img', id='imgCaptcha')
        image_response = br.open_novisit(img['src'])
        captcha = Captcha()
        #captcha.cookie = "123456788sids"
        #captcha.image = db.Blob(image_response.read())
        captcha.regno = regno
        for cook in cj:
            captcha.cookie = cook.value
            captcha.cookiename = cook.name
        captcha.put()
        self.response.headers['Content-Type'] = 'image/jpeg'
        self.response.out.write(image_response.read())
def theme_worker():
    def get_projects(doc):
        for result in doc.findAll(title=u"Project acronym"):
            a = result.a
            link = "http://cordis.europa.eu" + dict(a.attrs)['href'][2:]
            yield link

    logging.info('START THEME WORKER')
    while True:
        count = 0
        theme = q.get()
        logging.info('THEME: %s', repr(theme))
        url = THEME_URL % {'theme': theme}
        try:
            while True:
                r = requests.get(url, config=REQUESTS_CONFIG)
                if not r.ok:
                    logging.error("Request failed for url: %s", url)
                    continue
                doc = BeautifulSoup(r.content)
                for proj in get_projects(doc):
                    project_queue.put((theme, proj))
                    count += 1
                try:
                    next_ = dict(doc.find(
                        text="Next 20 projects »").parent.attrs
                    )['href'][2:]
                except AttributeError:
                    break
                url = "http://cordis.europa.eu" + next_
        except Exception, e:
            logging.error("THEME_WORKER: Error for url: %s", url)
            logging.error(e)
        finally:
            # mark the queue item as processed
            q.task_done()
def selectForm(self, r):
    html = r.content
    linkget = r.url
    forms_filter = SoupStrainer('form')
    soup = BeautifulSoup(html, parseOnlyThese=forms_filter)
    forms_post = ClientForm.ParseFile(StringIO.StringIO(soup.prettify()),
                                      linkget, backwards_compat=False)
    return forms_post
def main():
    #for p in range(1,intGetMaxPage +1):
    #soup = BeautifulSoup()
    try:
        resp = urllib2.urlopen(getUrl, timeout=10)
        soup = BeautifulSoup(resp)
        soup = soup.find('div', {'id': 'prodlist'})

        #for k in soup.findAll("div", {'class': 'p-name'}):  # grab <div class='p-name'>...</div>
        for k in soup.findAll('a', href=True):
            try:
                url = k.get('href')
                print k.text
                print url
                page_url = homeUrl + url
                print page_url
                resp_text_page = urllib2.urlopen(homeUrl + url, timeout=10)
                soup_text_page = BeautifulSoup(resp_text_page)
                contextPageUrl(soup_text_page, page_url)
            except:
                print "Unexpected error:", sys.exc_info()[0]
                print "Unexpected error:", sys.exc_info()[1]
                continue
    except:
        #continue
        print "Unexpected error:", sys.exc_info()[0]
        print "Unexpected error:", sys.exc_info()[1]
        pass
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    #print html
    root = lxml.html.fromstring(html)
    soup = BeautifulSoup(html)  # using BeautifulSoup to find next page links
    scrape_table(root)  # before carrying on, scrape the hrefs using the scrape_table function
    #print soup
    items = soup.findAll('a', title="Next page")  # findAll "next page" links
    if items:  # if there is a next page link, continue
        next_link = root.cssselect("div.srch-Page.srch-Page-bg a")
        #print next_link
        if next_link:
            next_link2 = next_link[2].attrib['href']
            #print next_link2
            split_link = re.split("\)+", next_link2)
            split_link2 = re.split("\=+", split_link[0])
            split_link3 = re.split("\'+", split_link2[2])
            #print split_link3[0]
            #print split_link2
            #if split_link ==11:
            next_url = nextlink_url + split_link3[0]
            if next_url:
                print next_url
                scrape_and_look_for_next_link(next_url)
def getsubhyperlink(origin_url, html_content, reslist, temp_set):
    soup = BeautifulSoup(html_content, parseOnlyThese=SoupStrainer('a'))
    hyperlink = soup.findAll('a', href=True)
    for tag in hyperlink:
        if "https" in tag['href'] or "http" in tag['href']:
            if tag['href'] not in temp_set:
                if origin_url in tag['href']:
                    reslist.append(tag['href'])
                    temp_set.append(tag['href'])
        else:
            if "www" in tag['href']:
                temp_url = "http://" + tag['href']
                if temp_url not in temp_set:
                    if origin_url in temp_url:
                        reslist.append(temp_url)
                        temp_set.append(temp_url)
            else:
                if tag['href'] and tag['href'][0] == '/':
                    temp_url = origin_url + tag['href']
                    if temp_url not in temp_set:
                        reslist.append(temp_url)
                        temp_set.append(temp_url)
                else:
                    temp_url = origin_url + tag['href']
                    if temp_url not in temp_set:
                        reslist.append(temp_url)
                        temp_set.append(temp_url)
def get_syllables(word):
    url = 'http://www.wordcalc.com/index.php'

    post_data = urllib.urlencode({'text': word})
    post_data = '%s&optionSyllableCount&optionWordCount' % post_data

    cnxn = urllib.urlopen(url, post_data)
    response = cnxn.read()
    cnxn.close()

    soup = BeautifulSoup(response)
    h3_matches = [h3 for h3 in soup.findAll('h3') if h3.text == 'Statistics']
    if len(h3_matches) != 1:
        raise Exception('Wrong number of <h3>Statistics</h3>')
    h3_match = h3_matches[0]

    table = h3_match.findNextSibling('table')

    td_matches = [td for td in table.findAll('td') if td.text == 'Syllable Count']
    if len(td_matches) != 1:
        raise Exception('Wrong number of <td>Syllable Count</td>')
    td_match = td_matches[0]

    td_value = td_match.findNextSibling('td')
    syllable_count = int(td_value.text)
    return syllable_count
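# A hedged usage sketch for get_syllables() above: it assumes the wordcalc.com form still
# accepts these POST fields and that urllib and BeautifulSoup are already imported in
# this module; the sample words are arbitrary.
if __name__ == '__main__':
    for w in ('python', 'beautiful', 'soup'):
        try:
            print w, get_syllables(w)
        except Exception, e:
            # network errors or a changed page layout surface here
            print w, 'lookup failed:', e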
def setUp(self):
    "Setting common information"
    try:
        from BeautifulSoup import BeautifulSoup, SoupStrainer
    except ImportError:
        self.indices = None
        return
    # Load the file as a tree, but only take the SST table (border=1)
    from urllib import urlopen
    url = "http://www.cpc.noaa.gov/products/analysis_monitoring/"\
          "ensostuff/ensoyears.shtml"
    url = urlopen(url)
    table = BeautifulSoup(url.read(),
                          parseOnlyThese=SoupStrainer("table", border=1))
    # Separate it by rows, but skip the first one (the header)
    years = []
    indices = []
    color = dict(red=+1, white=0, blue=-1)
    deft = [(None, 'color:white')]
    for row in table.findAll("tr")[1:]:
        cols = row.findAll('td')
        years.append(int(cols.pop(0).strong.string))
        indices.append([color[getattr(_.span, 'attrs', deft)[0][-1].split(':')[-1]]
                        for _ in cols])
    start_date = ts.Date('M', year=years[0], month=1)
    self.indices = time_series(np.array(indices).ravel(),
                               start_date=start_date)
def getMovieData(self): list = [] #-- get serial play list & parameters ------------------------------------- html = self.Auth.get_HTML(self.serial_url, None, 'http://serialu.net/media/uppod.swf') # -- parsing web page html = re.compile('<body>(.+?)<\/body>', re.MULTILINE|re.DOTALL).findall(html)[0] soup = BeautifulSoup(html) pl_url = '' is_multiseason = len(soup.findAll('object', {'type':'application/x-shockwave-flash'})) for rec in soup.findAll('object', {'type':'application/x-shockwave-flash'}): if is_multiseason > 1: season = rec.parent.previousSibling.previousSibling.text+r' ' else: season = r'' for par in rec.find('param', {'name':'flashvars'})['value'].split('&'): if par.split('=')[0] == 'pl': pl_url = par[3:] if pl_url.find('http:') == -1: pl_url = xppod.Decode(pl_url) #-- get playlist details --------------------------------------------------- html = self.Auth.get_HTML(pl_url, None, 'http://serialu.net/media/uppod.swf') self.pl_url = pl_url # -- check if playlist is encoded if html.find('{"playlist":[') == -1: html = xppod.Decode(html).encode('utf-8').split(' or ')[0] #-- TODO: make smart choice # -- parsing web page s_url = '' s_num = 0 movie_list = [] for rec in re.compile('{(.+?)}', re.MULTILINE|re.DOTALL).findall(html.replace('{"playlist":[', '')): for par in rec.replace('"','').split(','): if par.split(':')[0]== 'comment': name = str(s_num+1) + ' серия' #par.split(':')[1]+' ' if par.split(':')[0]== 'file': if 'http' in par.split(':')[1]: s_url = par.split(':')[1]+':'+par.split(':')[2] else: s_url = xppod.Decode(par.split(':')[1]).split(' or ')[0] s_num += 1 # mark part for history name = season.encode('utf-8') + name movie_list.append({'movie_name': name, 'url': s_url}) #if h_part <> '-': # if name == h_part: # name = '[COLOR FF00FF00]'+name+'[/COLOR]' #-- parse data list.append({'name':self.serial_name, 'img': self.serial_img, 'descr': self.serial_descr, 'season_number':s_num, 'name_orig':'', 'movie': movie_list}) #-- return movie list return list
def extract_title(url):
    page = open(page_loc(url))
    soup = BeautifulSoup(page.read())
    title = soup.find('title')
    title = title.string.encode('utf-8')
    gadgets.string_to_file(title, title_loc(url))
    page.close()
def parseLyrics(lyricList, outlist, s, e):
    baseURL = u'http://www.darklyrics.com'
    i = 0
    for key in lyricList:
        i = i + 1
        if i >= s and i <= e:
            #key = 'In Flames' # REMOVE FOR 100 Bands
            time.sleep(1)
            turl = lyricList[key]
            print 'Looking up band ' + key
            #print turl
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            page = opener.open(turl)
            soup = BeautifulSoup(page.read())
            divs = soup.findChildren('div', attrs={"class": "album"})
            # get the sub-URL to the lyrics of the latest album and then full URL to the lyrics source
            if len(divs) > 0:
                sub_url = divs[len(divs)-1].findChildren('a')[0]['href']
                lurl = baseURL + sub_url.split('#')[0][2:]
                #print lurl
                # hit the URL and get data
                page = opener.open(lurl)
                soup = BeautifulSoup(page.read())
                lydiv = soup.findChildren('div', attrs={"class": "lyrics"})[0]
                [x.extract() for x in lydiv('div')]
                #lyrictext = re.sub('\'lydiv.text ;
                rly = getRawLyrics(lydiv)
            else:
                rly = "Manual"
            print rly
            outlist[key] = rly
            #break  # remove once started full testing
    print 'done', s, ' to ', e
    return outlist
def get_favicon_url(url):
    if not url.startswith('http'):
        url = "http://{0}".format(url)

    # Check if the root location has a favicon before parsing for it
    if _has_root_favicon(url):
        return urlparse.urljoin(url, 'favicon.ico')

    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib2.Request(url, None, headers)
    website = urllib2.urlopen(request).read()
    soup = BeautifulSoup(website)
    favicon_element = soup.find("link", rel="shortcut icon")
    if favicon_element:
        hostname = urlparse.urlparse(url).hostname
        favicon_url = favicon_element['href']
        if favicon_url.startswith('//cdn'):
            return "http:" + favicon_url
        # favicon url is relative and must be converted to absolute path
        elif hostname not in favicon_url:
            return urlparse.urljoin(url, favicon_url)
        else:
            return favicon_url
    else:
        return None
def get_epfile(url):
    """
    Return the file (mp3) URL to be read from the website to play the
    selected reloaded episode.

    Input the webpage URL of the episode to be played.
    E.g.: http://www.deejay.it/audio/20130526-4/269989/

    Output the URL of the mp3 (rarely a wma) file to be played to listen
    to the selected episode.
    E.g.: http://flv.kataweb.it/deejay/audio/dee_giallo/deegiallolosmemoratodicollegno.mp3

    Returns an empty string if the file cannot be found.
    """
    soup = BeautifulSoup(urllib2.urlopen(url))
    fileurl = soup.find('div', {'id': 'playerCont'})
    if not fileurl:
        return ''
    else:
        hit = re.findall("file=(.*.mp3)&", fileurl.iframe['src'])
        if not hit:
            return ''
        else:
            return hit[0]
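# A hedged usage sketch for get_epfile() above, assuming urllib2, re and BeautifulSoup
# are imported in this module and the deejay.it page layout still matches; the episode
# URL is the one quoted in the docstring.
episode = 'http://www.deejay.it/audio/20130526-4/269989/'
mp3 = get_epfile(episode)
if mp3:
    print 'stream URL:', mp3
else:
    print 'no playable file found for', episode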
def start(self):
    with QMutexLocker(self.mutex):
        self.stoped = False
    #for i in range(self.start_p,self.end_p):
    for i in range(1, 3):
        while self.suspended:
            self.wait()
            return
        if self.stoped:
            return
        url = "http://www.99fang.com/service/agency/a1/?p=%d" % i
        print url
        try:
            r = urllib2.urlopen(url).read()
            soup = BeautifulSoup(r)
            box = soup.find("div", {'class': 'agency-call-box'})
            lis = box("li")
            for li in lis:
                tel = li.a.string
                print tel
                r = urllib2.urlopen("http://suzhou.jjr360.com/app.php?c=spider&a=index&city=&tel=%s" % tel)
                print r.read()
        except:
            pass
        else:
            #self.emit(SIGNAL("updateTime()"))
            time.sleep(1)
def _on_login(self, page):
    soup = BeautifulSoup(page)
    if soup.find('a', text='Log in'):
        raise LoginError(page)
    self._browser.save_cookies()
    return soup
def fetch_page(link_id):
    link = Link.objects.get(pk=link_id)
    url = link.url
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'}
    req = urllib2.Request(url, None, headers)
    try:
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html)
        link.title = soup.find('title').text
        favicon = soup.find('link', rel='shortcut icon')
        if favicon and favicon['href']:
            link.favicon = urljoin(url, favicon['href'])
        for item in soup.findAll('meta'):
            if item.get('name', '').lower() in ('description', 'og:description') and item.get('content', ''):
                link.description = item.get('content', '')
    except Exception as e:
        link.is_error = 1
        # only URLError carries a .reason attribute, so fall back to str(e)
        link.error_text = str(getattr(e, 'reason', e))
    link.save()
def split_contents(self):
    """ Iterates over the elements in the block """
    if self.split_content:
        return self.split_content
    split = self.soup.findAll({'link': True, 'style': True})
    for elem in split:
        if elem.name == 'link' and elem['rel'] == 'stylesheet':
            filename = self.get_filename(elem['href'])
            path, ext = os.path.splitext(filename)
            if ext in settings.COMPILER_FORMATS.keys():
                if self.recompile(filename):
                    self.compile(path, settings.COMPILER_FORMATS[ext])
                basename = os.path.splitext(os.path.basename(filename))[0]
                elem = BeautifulSoup(re.sub(basename+ext, basename+'.css', unicode(elem)))
                filename = path + '.css'
            try:
                self.split_content.append(('file', filename, elem))
            except UncompressableFileError:
                if django_settings.DEBUG:
                    raise
        if elem.name == 'style':
            data = elem.string
            elem_type = elem.get('type', '').lower()
            if elem_type and elem_type != "text/css":
                # it has to be preprocessed
                if '/' in elem_type:
                    # we accept 'text/ccss' and plain 'ccss' too
                    elem_type = elem_type.split('/')[1]
                # TODO: that dot-adding compatibility stuff looks strange.
                # do we really need a dot in COMPILER_FORMATS keys?
                ext = '.' + elem_type
                data = self.compile_inline(data, ext)
                elem = ''.join(("<style type='text/css'>\n", data, "\n</style>"))
            self.split_content.append(('hunk', data, elem))
    return self.split_content
def _on_page(self, page):
    if not page:
        import ipdb
        ipdb.set_trace()
    soup = BeautifulSoup(page)
    if not soup.find('a', text='Log in'):
        event = soup.find('b', text='Something has happened!')
        if event:
            cell = event.findParent('table').findAll('td')[2]
            text = ''.join([x.text if hasattr(x, 'text') else x
                            for x in cell.childGenerator()])
            self._logger.info("Something has happened: %s", text)
        try:
            self._neopoints = get_np(soup)
        except NoNpInPage:
            pass
        return soup
    self._logger.info('Need to login. Using account %s', self._username)
    data = dict(username=self._username,
                password=self._password,
                destination=soup.find(
                    'input', attrs=dict(name='destination'))['value'])
    d = self._browser.post('http://www.neopets.com/login.phtml', data)
    d.addCallback(self._on_login)
    return d
def removecut(string):
    soup = BeautifulSoup(string, selfClosingTags=['img', 'br'])
    tag = soup.find('yvcut')
    if not tag:
        return string
    tag.extract()
    string = soup.renderContents()
    return string
def test_diffing(self): create_langs_and_versions(self.video, ['en']) eng = self.video.newsubtitlelanguage_set.get(language_code='en') subtitles = SubtitleSet.from_list('en', [ (10000, 20000, "1 - :D"), (20000, 30000, "2 - :D"), (30000, 40000, "3 - :D"), (40000, 50000, "4 - :D"), (50000, 60000, "5 - :D"), ]) subtitles2 = SubtitleSet.from_list( 'en', [ (10000, 20000, "1 - :D"), (20000, 25000, "2 - :D"), # time change, (30000, 40000, "Three - :D"), # text change, # multiple lines replaced by a single line (40000, 60000, "45 - :D"), ]) first_version = eng.add_version(subtitles=subtitles) second_version = eng.add_version(subtitles=subtitles2) # Note on the argument order to diff: we always diff the more recent # version against the less recent diff_result = diff(subtitles2, subtitles) response = self._simple_test('videos:diffing', [first_version.id, second_version.id]) self.assertEquals(diff_result, response.context['diff_data']) diff_sub_data = diff_result['subtitle_data'] html = BeautifulSoup(response.content) diff_list = html.find('ol', {"class": 'subtitles-diff'}) diff_items = diff_list.findAll('li') # check number of lines self.assertEquals(len(diff_items), len(diff_sub_data)) def check_column_data(column, sub_data): """Check the data in the HTML for a column against the data in from diff() """ # special check for empty lines if sub_data.text is None: self.assertEquals(column.string.strip(), "") return time_span, text_span = column.findAll('span', recursive=False) self.assertEquals(text_span.string.strip(), sub_data.text) time_child_spans = time_span.findAll('span', {'class': 'stamp_text'}) self.assertEquals(time_child_spans[0].string.strip(), format_sub_time(sub_data.start_time)) self.assertEquals(time_child_spans[1].string.strip(), format_sub_time(sub_data.end_time)) for li, diff_sub_data_item in zip(diff_items, diff_sub_data): # Intuitively, left_column should be compared against # ['subtitles'][0], but we do the opposite. This is because of # the way things are ordered: # - diff() was passed (older_version, newer_version) # - The rendered HTML has the newer version on the left and the # older version on the right check_column_data(li.find('div', {'class': 'left_column'}), diff_sub_data_item['subtitles'][1]) check_column_data(li.find('div', {'class': 'right_column'}), diff_sub_data_item['subtitles'][0]) # we use the time_change class for either text or time changes. time_changes = li.findAll('span', {'class': 'time_change'}) if (diff_sub_data_item['time_changed'] or diff_sub_data_item['text_changed']): self.assertNotEqual(len(time_changes), 0) else: self.assertEquals(len(time_changes), 0)
def mobileUA(content):
    soup = BeautifulSoup(content, convertEntities=BeautifulSoup.HTML_ENTITIES)
    res = soup.find('html')
    res = res.get('class', '') if res else ''
    return True if 'a-mobile' in res or 'a-tablet' in res else False
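# A quick hedged check of mobileUA() above with inline HTML; the class names mirror the
# markers the function looks for and are made up for the demo.
print mobileUA('<html class="a-mobile"><body></body></html>')   # True
print mobileUA('<html class="a-desktop"><body></body></html>')  # False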
def _parseHTML(br):
    response = br.response().read().decode('utf-8')
    response = re.sub(r'(?i)(<!doctype \w+).*>', r'\1>', response)
    soup = BeautifulSoup(response, convertEntities=BeautifulSoup.HTML_ENTITIES)
    return response, soup
import re
import urllib
import datetime
from BeautifulSoup import BeautifulSoup

sock = urllib.urlopen("http://www.timesnow.tv/")
htmlSrc = sock.read()
soup = BeautifulSoup(htmlSrc)
print "The Times now\n"
# match anchor tags whose href ends in .cms; a plain string would be compared
# literally, so the pattern has to be compiled into a regex
for a in soup.findAll('a', attrs={'href': re.compile(r'[a-zA-Z,0-9;:,]*\.cms')}):
    print a.text
## GURPREET SINGH
import scraperwiki
from BeautifulSoup import BeautifulSoup


def scrape_table(soup):
    # define the column names used in the table
    scraperwiki.sqlite.save('data_columns', ['Word', 'Obama', 'Clinton', 'Reagan'])
    table = soup.find("table", {"class": "in-article sortable"})
    # each row of the table is selected
    rows = table.findAll("tr")
    for row in rows:
        record = {}
        table_td = row.findAll("td")
        record['Word'] = table_td[0].text
        record['Obama'] = table_td[1].text
        record['Clinton'] = table_td[2].text
        record['Reagan'] = table_td[3].text
        print record,
        print "-" * 10
        # Save data step by step
        scraperwiki.sqlite.save(["Word"], record)

# website link
Website = 'http://www.guardian.co.uk/news/datablog/2011/may/25/us-presidents-adressing-parliament-obama-clinton-reagan-speech-word-count'
html = scraperwiki.scrape(Website)
soup = BeautifulSoup(html)
scrape_table(soup)
def Soup_check(html): soup = BeautifulSoup(html) if html == " <script language='javascript' type='text/javascript'>window.location.href = 'http://sms.fullonsms.com/action_main.php';</script>": return True confirmation160 = soup.find('div', attrs={"class": "h-sta"}) if confirmation160: print "+++++++++++++++ Service Response +++++++++++++++++" print "+|", print confirmation160.find('h2').\ findAll(text=True)[0].strip().replace('\r', '') print "++++++++++++++++++++++++++++++++++++++++++++++++++" w2s_Confirmation = soup.find('div', attrs={"class": "confirm"}) if w2s_Confirmation: print "+++++++++++++++ Service Response +++++++++++++++++" print "+|", w2s_Confirmation.find('h2').findAll(text=True)[0] print "++++++++++++++++++++++++++++++++++++++++++++++++++" w2sms_mobile_no = soup.find('div', attrs={"class": "mobile-in"}) if w2sms_mobile_no: print "+++++++++++++ Way2Sms Login Detail +++++++++++++++" name = soup.find('span', attrs={"onmouseover": "dismouout();"}) print "+| Name:", name.findAll(text=True)[0] Text_list = w2sms_mobile_no.findAll(text=True) cut = ['\t', '\n', '\r', ' ', '.'] for text in Text_list[:]: i = Text_list.index(text) for s in cut: text = text.replace(s, '') Text_list[i] = text if not text: Text_list.remove(text) print "+|", ': '.join(Text_list) email = str(soup.find('input', attrs={"id": "logemail"})) print "+| Email:", print email[email.index('value=') + 7:email.index('>') - 3] ips = soup.find('div', attrs={"class": "item1 flt ip"}) Text_list = ips.findAll(text=True) cut = [' ', '\n', ' '] for text in Text_list[:]: i = Text_list.index(text) for s in cut: text = text.replace(s, '') Text_list[i] = text if not text: Text_list.remove(text) for i in range(0, len(Text_list), 2): print "+|", Text_list[i], print Text_list[i + 1] if i + 1 < len(Text_list) else '' return True acc_details = soup.find('div', attrs={"class": "mad"}) if acc_details: print "++++++++++++++ 160by2 Login Detail +++++++++++++++" Text_list = acc_details.findAll(text=True) rem = [u'Change Password', u'(Change)', u'\n'] cut = [ ' ', ] for text in Text_list[:]: if [x for x in rem if x in text]: Text_list.remove(text) else: i = Text_list.index(text) for s in cut: text = text.replace(s, '') Text_list[i] = text print "$|", Text_list[0] for i in range(1, len(Text_list), 3): print "+| %s%s %s" % ( Text_list[i], Text_list[i + 1] if i + 1 < len(Text_list) else '', Text_list[i + 2] if i + 2 < len(Text_list) else '') last_login = soup.find('div', attrs={"class": "lh"}) Text_list = last_login.findAll(text=True) rem = [u'\n', u'about', u'view', u'button'] for text in Text_list[:]: if [x for x in rem if x in text]: Text_list.remove(text) else: i = Text_list.index(text) for s in cut: text = text.replace(s, '') Text_list[i] = text print "$|", Text_list[0] for i in range(1, len(Text_list), 3): print "+| %s%s %s" % ( Text_list[i], Text_list[i + 1] if i + 1 < len(Text_list) else '', Text_list[i + 2] if i + 2 < len(Text_list) else '') return True return False
def parse(self, of=None, req=None, limit=CFG_EXTERNAL_COLLECTION_MAXRESULTS): """Parse buffer to extract records.""" if CFG_BEAUTIFULSOUP_INSTALLED: soup = BeautifulSoup(self.buffer) # Remove "more" links that include Indico Javascript more_links = soup.findAll('a', { "class": "searchResultLink", "href": "#" }) [more_link.extract() for more_link in more_links] # Events event_results = soup.findAll('li', {"class": "searchResultEvent"}) event_index = 1 for result in event_results: self.add_html_result((event_index == 1 and '<b>Events:</b><br/>' or '') + \ str(result) + '<br />', limit) event_index += 1 # Contributions contribution_results = soup.findAll( 'li', {"class": "searchResultContribution"}) contribution_index = 1 for result in contribution_results: self.add_html_result((contribution_index == 1 and '<b>Contributions:</b><br/>' or '') + \ str(result) + '<br />', limit) contribution_index += 1 else: # Markup is complex. Do whatever we can... # Events split_around_events = self.buffer.split( '<li class="searchResultEvent">') if len(split_around_events) > 1: event_index = 1 for html_chunk in split_around_events[1:]: output = '<li class="searchResultEvent">' if event_index == len(split_around_events) - 1: split_around_link = html_chunk.split( 'searchResultLink') split_around_ul = 'searchResultLink'.join( split_around_link[1:]).split('</ul>') output += split_around_link[0] + 'searchResultLink' + \ split_around_ul[0] + '</ul>' + split_around_ul[1] else: output += html_chunk self.add_html_result((event_index == 1 and '<b>Events:</b><br/>' or '') + \ output + '<br />', limit) event_index += 1 # Contributions split_around_contributions = self.buffer.split( '<li class="searchResultContribution">') if len(split_around_contributions) > 1: contribution_index = 1 for html_chunk in split_around_contributions[1:]: output = '<li class="searchResultContribution">' if contribution_index == len( split_around_contributions) - 1: split_around_link = html_chunk.split( 'searchResultLink') split_around_ul = 'searchResultLink'.join( split_around_link[1:]).split('</ul>') output += split_around_link[0] + 'searchResultLink' + \ split_around_ul[0] + '</ul>' + split_around_ul[1] else: output += html_chunk self.add_html_result((contribution_index == 1 and '<b>Contributions:</b><br/>' or '') + \ output + '<br />', limit) contribution_index += 1
def _get_newblogpost_dom(self):
    response = self._get_newblogpost()
    return BeautifulSoup(response.html())
def Movie_List(params): #-- get filter parameters par = Get_Parameters(params) # show search dialog if par.search == 'Y': skbd = xbmc.Keyboard() skbd.setHeading('Поиск сериалов.') skbd.doModal() if skbd.isConfirmed(): SearchStr = skbd.getText().split(':') url = 'http://seasonvar.ru/autocomplete.php?query=' + urllib.quote( SearchStr[0]) par.search = SearchStr[0] else: return False else: url = 'http://seasonvar.ru/index.php?onlyjanrnew=' + par.genre + '&&sortto=name&country=' + par.country + '&nocache=' + str( random.random()) #== get movie list ===================================================== html = get_HTML(url) # -- parsing web page -------------------------------------------------- count = 1 list = [] if par.search != '': #-- parsing search page s = json.loads(html) count = len(s['suggestions']) if count < 1: return False for i in range(0, count): name = s['suggestions'][i].encode('utf-8') list.append({ 'title': name, 'url': 'http://seasonvar.ru/' + s['data'][i], 'img': icon }) else: #-- parsing serial list soup = BeautifulSoup(html, fromEncoding="utf-8") # -- get number of serials mtag = GetTag(soup) #with open('d:\\seasonvar.html', 'a') as the_file: # the_file.write(html) if par.alphabet == '': count = 0 for rec in soup.findAll('div', {'class': 'alf-letter'}): a_name = u'[COLOR FF00FFF0][B]' + rec.text + u'[/B][/COLOR] сериалов: ' + str( len(rec.parent.findAll('div', {'class': mtag}))) list.append({ 'title': a_name.encode('utf-8'), 'alphabet': rec.text.encode('utf-8') }) count = count + len(rec.parent.findAll('div', {'class': mtag})) else: for reca in soup.findAll('div', {'class': 'alf-letter'}): if reca.text.encode('utf-8') == par.alphabet: for rec in reca.parent.findAll('div', {'class': mtag}): list.append({ 'url': 'http://seasonvar.ru' + rec.find('a')['href'].encode('utf-8'), 'title': rec.find('a').text.encode('utf-8'), 'img': 'http://cdn.seasonvar.ru/oblojka/' + rec['id'].replace('div', '') + '.jpg' }) count = len(list) #-- add header info Get_Header(par, count) #-- get movie info #try: if par.alphabet != '' or par.search != '': for rec in list: i = xbmcgui.ListItem(rec['title'], iconImage=rec['img'], thumbnailImage=rec['img']) u = sys.argv[0] + '?mode=SERIAL' u += '&name=%s' % urllib.quote_plus(rec['title']) u += '&title=%s' % urllib.quote_plus(rec['title']) u += '&url=%s' % urllib.quote_plus(rec['url']) u += '&genre=%s' % urllib.quote_plus(par.genre) u += '&genre_name=%s' % urllib.quote_plus(par.genre_name) u += '&country=%s' % urllib.quote_plus(par.country) u += '&country_name=%s' % urllib.quote_plus(par.country_name) xbmcplugin.addDirectoryItem(h, u, i, True) else: for rec in list: i = xbmcgui.ListItem(rec['title'], iconImage=icon, thumbnailImage=icon) u = sys.argv[0] + '?mode=MOVIE' #u += '&name=%s'%urllib.quote_plus(rec['title']) #u += '&title=%s'%urllib.quote_plus(rec['title']) u += '&alphabet=%s' % urllib.quote_plus(rec['alphabet']) u += '&genre=%s' % urllib.quote_plus(par.genre) u += '&genre_name=%s' % urllib.quote_plus(par.genre_name) u += '&country=%s' % urllib.quote_plus(par.country) u += '&country_name=%s' % urllib.quote_plus(par.country_name) xbmcplugin.addDirectoryItem(h, u, i, True) #except: # pass xbmcplugin.endOfDirectory(h)
def getTableDimension(arrTable):
    # this function derives the table dimensions and contents from the HTML string stream
    # it carries on from fetchHTML()
    # "table dimensions" here means the number of rows and columns in the table

    # initialise the 'largest_table' and 'max_rows' variables
    # how do we decide which table holds the data?
    # pick the largest table, i.e. the one with the most rows
    largest_table = None
    max_rows = 0
    for table in arrTable:
        # check the row count of each table in the collection one by one
        # and keep it in a variable named numRows
        numRows = len(table.findAll(lambda tag: tag.name == 'tr' and tag.findParent('table') == table))
        # whenever a table has more rows than the current maximum, make it the provisional max_rows;
        # repeating this leaves max_rows holding the highest row count
        if numRows > max_rows:
            largest_table = table
            max_rows = numRows
    # from here on refer to the 'largest table' simply as 'table'
    # and to the 'highest row count' simply as 'the row count'
    table = largest_table
    numRows = max_rows

    # how do we determine the number of columns?
    numCols = len(table.contents[1])

    # how do we determine how many rows are used as the header?
    soup = BeautifulSoup(str(table))
    rows = soup.findAll('tr')
    # initialise numRowsHead as the number of rows that contain header cells
    numRowsHead = 0
    # inspect every row one by one
    for i in range(0, numRows):
        # if a given row contains a <th> tag
        if rows[i].findAll('th'):
            # then that row index (plus one) becomes the header row count
            numRowsHead = i + 1
    # getTableDimension finally returns the row count, the number of header rows,
    # the column count and the table itself
    return numRows, numRowsHead, numCols, table
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
import scraperwiki
from scraperwiki import sqlite

mech = Browser()
url = 'http://www.gpupdate.net/en/standings/190/2013-motogp-standings/'
page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html)

resContainer = soup.find("div", {"id": "middle_container"})
rownumber = 0
table = soup.find("table")
for row in table.findAll('tr')[1:30]:
    col = row.findAll('td')
    pos = int(col[0].string.replace(".", ""))
    driver = col[1].a.string
    tempTD = col[1]
    team = tempTD.findAll('span')
    team = team[1].string
    points = col[2].string
    country = tempTD.findAll('img')
    country = country[0]['alt'].upper()
def Serial_Info(params): #-- checkif SWD decompiler set up properly if not Check_SWF(): return False #-- get filter parameters par = Get_Parameters(params) #== get serial details ================================================= tvshowtitle = par.title full_name = par.name url = par.url html = get_HTML(url) # -- parsing web page -------------------------------------------------- soup = BeautifulSoup(html, fromEncoding="windows-1251") # -- check if serial has seasons and provide season list if par.is_season == '' and len( soup.findAll('div', {'class': 'full-news-2-content'})) > 0: #-- generate list of seasons for rec in soup.find('div', { 'class': 'full-news-2-content' }).findAll('a'): s_url = ('http://seasonvar.ru' + rec['href']).encode('utf-8') s_name = rec.text.replace('>>>', '').replace(u'Сериал ', '') if s_name.find(u'сезон(') > -1: s_name = s_name.split(u'сезон(')[0] + u'сезон' s_name = s_name.encode('utf-8') s_id = rec['href'].split('-')[1] s_image = 'http://cdn.seasonvar.ru/oblojka/' + s_id + '.jpg' i = xbmcgui.ListItem(s_name, iconImage=s_image, thumbnailImage=s_image) u = sys.argv[0] + '?mode=SERIAL' #-- filter parameters u += '&name=%s' % urllib.quote_plus(s_name) u += '&title=%s' % urllib.quote_plus(tvshowtitle) u += '&url=%s' % urllib.quote_plus(s_url) u += '&genre=%s' % urllib.quote_plus(par.genre) u += '&genre_name=%s' % urllib.quote_plus(par.genre_name) u += '&country=%s' % urllib.quote_plus(par.country) u += '&country_name=%s' % urllib.quote_plus(par.country_name) u += '&is_season=%s' % urllib.quote_plus('*') xbmcplugin.addDirectoryItem(h, u, i, True) else: #-- generate list of movie parts # -- get movie info for rec in soup.find('td', {'class': 'td-for-content'}).findAll('p'): if len(rec.findAll('span', {'class': 'videl'})) > 0: for j in str(rec).split('<br />'): r = re.compile( '<span class="videl">(.+?)<\/span>(.+?)<\/br>', re.MULTILINE | re.DOTALL).findall(str(j) + '</br>') for s in r: if s[0] == 'Жанр:': mi.genre = s[1].replace('</p>', '') if s[0] == 'Страна:': mi.country = s[1].replace('</p>', '') if s[0] == 'Вышел:': mi.year = s[1].replace('</p>', '') if s[0] == 'Режисёр:': mi.director = s[1].replace('</p>', '') if s[0] == 'Роли:': mi.actors = s[1].replace('</p>', '') else: mi.text = rec.text.encode('utf-8') mi.actors = mi.actors.split(',') mi.img = soup.find('td', { 'class': 'td-for-content' }).find('img')['src'] # -- get serial parts info # -- mane of season i = xbmcgui.ListItem('[COLOR FFFFF000]' + par.name + '[/COLOR]', path='', thumbnailImage=icon) u = sys.argv[0] + '?mode=EMPTY' xbmcplugin.addDirectoryItem(h, u, i, False) pname = par.name # -- get list of season parts s_url = '' s_num = 0 #--------------------------- try: playlist, playlist_url, swf_player = Get_PlayList(soup, url) except: Initialize() playlist, playlist_url, swf_player = Get_PlayList(soup, url) if playlist == '': return False for rec in playlist: name = rec['name'] s_url = rec['video'] i = xbmcgui.ListItem(name, path=urllib.unquote(s_url), thumbnailImage=mi.img) # iconImage=mi.img u = sys.argv[0] + '?mode=PLAY' u += '&url=%s' % urllib.quote_plus(s_url) u += '&name=%s' % urllib.quote_plus(pname) u += '&full_name=%s' % urllib.quote_plus(full_name) u += '&title=%s' % urllib.quote_plus(tvshowtitle) u += '&img=%s' % urllib.quote_plus(mi.img) u += '&playlist=%s' % urllib.quote_plus(playlist_url) try: cast = re.compile(">(.+?)</a>").findall(mi.actors) except: cast = [] i.setInfo(type='video', infoLabels={ 'title': name, 'cast': cast, 'artist': mi.actors, 'year': int(mi.year), 'director': mi.director, 
'plot': mi.text, 'genre': mi.genre }) i.setProperty('fanart_image', mi.img) #i.setProperty('IsPlayable', 'true') xbmcplugin.addDirectoryItem(h, u, i, False) xbmcplugin.endOfDirectory(h)
def parse_hotellist_page(html, page_count): """Parses the website with the hotel list and prints the hotel name, the number of stars and the number of reviews it has. If there is a next page in the hotel list, it returns a list to that page. Otherwise, it exits the script. Corresponds to STEP 4 of the slides. Parameters ---------- html : str The HTML of the website with the hotel list. Returns ------- URL : str If there is a next page, return a relative link to this page. Otherwise, exit the script. """ soup = BeautifulSoup(html) # Extract hotel name, star rating and number of reviews hotel_boxes = soup.findAll( 'div', {'class': 'listing wrap reasoning_v5_wrap jfy_listing p13n_imperfect'}) if not hotel_boxes: log.info( "#################################### Option 2 ######################################" ) hotel_boxes = soup.findAll('div', {'class': 'listing_info jfy'}) if not hotel_boxes: log.info( "#################################### Option 3 ######################################" ) hotel_boxes = soup.findAll( 'div', {'class': 'listing easyClear p13n_imperfect'}) data = [] for hotel_box in hotel_boxes: hotel_name = hotel_box.find("a", {"target": "_blank"}).find(text=True) log.info("Hotel name: %s" % hotel_name.strip()) stars = hotel_box.find("img", {"class": "sprite-ratings"}) if stars: star = stars['alt'].split()[0] log.info("Stars: %s" % star) num_reviews = hotel_box.find("span", { 'class': "more" }).findAll(text=True) if num_reviews: num_reviews1 = [x for x in num_reviews if "review" in x][0].strip() log.info("Number of reviews: %s " % num_reviews1) link = hotel_box.find('a', {'class': "property_title"}) url = base_url + link['href'] # Sleep 2 sec before starting a new http request time.sleep(2) # Request page headers = {'User-Agent': user_agent} response = requests.get(url, headers=headers) new_html = response.text.encode('utf-8') row = helper(new_html) row.insert(0, float(num_reviews1.strip("reviews").replace(",", ""))) row.insert(0, float(star)) data.append(row) with open("hotels.csv", "a+") as file: csv.writer(file).writerows(data) # Get next URL page if exists, otherwise exit div = soup.find("div", {"id": "pager_bottom"}) # check if this is the last page pages = soup.find('span', {"class": "guiArw pageEndNext"}) if not pages is None: log.info("We reached last page.") sys.exit() # If not, return the url to the next page hrefs = div.findAll('a', href=True) for href in hrefs: next = str(page_count + 1) if href.find(text=True) == next: log.info("Next url is %s" % href['href']) return href['href']
def fasterBS(url, f):
    """fasterBS() - use BeautifulSoup to parse only anchor tags"""
    parsed = BeautifulSoup(f, parseOnlyThese=SoupStrainer('a'))
    links = [urljoin(url, x['href']) for x in parsed]
    output(links)
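# A hedged sketch showing the SoupStrainer trick from fasterBS() on its own: parsing only
# the <a> tags keeps BeautifulSoup 3 from building the full tree. The output() helper used
# above is assumed to exist elsewhere, so this demo just prints; the HTML is made up.
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup, SoupStrainer

html = '<p>intro</p><a href="/docs">docs</a><a href="http://example.org/">ext</a>'
anchors = BeautifulSoup(html, parseOnlyThese=SoupStrainer('a'))
print [urljoin('http://example.com/', a['href']) for a in anchors]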
def inicio(self, widget): def message(msg, model): if model == 1: diag = gtk.MessageDialog(self.janela, gtk.DIALOG_MODAL, gtk.MESSAGE_WARNING, gtk.BUTTONS_OK) elif model == 2: diag = gtk.MessageDialog(self.janela, gtk.DIALOG_MODAL, gtk.MESSAGE_INFO, gtk.BUTTONS_OK) diag.set_markup(msg) diag.run() diag.destroy() day = datetime.date.today() month = datetime.date.today().month year = datetime.date.today().year year_month = str(year) + "-" + str(month) timeString = time.strftime('%H:%M:%S') timeString2 = time.strftime('%H-%M') arq_name = str(day) + "_" + timeString2 # key validation (off) # URL_KEY = 'http://www.ow7.com.br/loppe.html' # page_key = requests.get(URL_KEY) # bs_key = BeautifulSoup(page_key.content) # a_key = bs_key.find('span', {'id': 'MM'}).string # print a_key a_key = 'teste' a_url = 'http://online4.detran.pe.gov.br/' a_url = a_url + 'NovoSite/Detran_Veiculos/result_Consulta.aspx?placa=' if a_key != 'teste': message("Contacte o administrador.\n\nKleber Soares\n" "81 8172.9074\[email protected]", 1) else: try: if not os.path.exists(year_month): mkdir(year_month) arq = open(self.filechooserbutton.get_filename()) str_placas = arq.read() # strip() serve para remover as quebras das linhas placas = str_placas.strip().split(",") placas = [x for x in placas if x] qtd_placas = len(placas) arq.close() i = 0 lin = 1 wb = xlwt.Workbook() ws = wb.add_sheet('Detran Pernambuco') ws.write(0, 0, 'PLACA') ws.write(0, 1, 'RESTRICAO 1') ws.write(0, 2, 'RESTRICAO 2') ws.write(0, 3, 'RESTRICAO 3') ws.write(0, 4, 'RESTRICAO 4') ws.write(0, 5, 'DATA') ws.write(0, 6, 'HORA') for placa in placas: placa = placa.strip() i += 1 self.count_in_thread(qtd_placas) self.progress_bar.set_text( "("+placa+") "+str(i)+"/"+str(qtd_placas)) while gtk.events_pending(): gtk.main_iteration() URL_ULTIMOS_RESULTADOS = a_url + placa page = requests.get(URL_ULTIMOS_RESULTADOS) bs = BeautifulSoup(page.content) labels = ( bs.find('span', {'id': 'lblRestricao1'} ).find('font').string, bs.find('span', {'id': 'lblRestricao2'} ).find('font').string, bs.find('span', {'id': 'lblRestricao3'} ).find('font').string, bs.find('span', {'id': 'lblRestricao4'} ).find('font').string, ) # csv = placa+"," ws.write(lin, 0, placa) col = 1 for label in labels: if not label: ws.write(lin, col, label) col += 1 ws.write(lin, 5, str(day)) ws.write(lin, 6, timeString) lin += 1 sleep(1) wb.save(year_month+"/"+arq_name+".xls") message("Arquivo gerado com sucesso.\n" "Verifique a pasta do aplicativo.", 2) except TypeError, erro: if not self.filechooserbutton.get_filename(): message("Selecione um arquivo.", 1) else: print "Um erro ocorreu: %s" % erro message("Um erro ocorreu: %s" % erro, 1)
def extract_from_html(raw_html, base_url, only_links=True): """ Extract URLs from HTML. Implementation notes: - The current implementation is fault tolerant, meaning it will try to extract URLs even if the HTML is malformed and browsers wouldn't normally see those links. This may therefore result in some false positives. - HTML5 tags are supported, including tags not currently supported by any major browser. :param raw_html: Raw HTML data. :type raw_html: str :param base_url: Base URL for the current document. :type base_url: str :param only_links: If True, only extract links to other resources. If False, extract all URLs. :type only_links: bool :returns: Extracted URLs. :rtype: set(str) """ # Set where the URLs will be collected. result = set() add_result = result.add # Remove the fragment from the base URL. base_url = urldefrag(base_url)[0] # Parse the raw HTML. bs = BeautifulSoup(raw_html, convertEntities=BeautifulSoup.ALL_ENTITIES) # Some sets of tags and attributes to look for. href_tags = {"a", "link", "area"} src_tags = { "form", "script", "img", "iframe", "frame", "embed", "source", "track" } param_names = {"movie", "href", "link", "src", "url", "uri"} # Iterate once through all tags... for tag in bs.findAll(): # Get the tag name, case insensitive. name = tag.name.lower() # Extract the URL from each tag that has one. url = None if name in href_tags: url = tag.get("href", None) elif name in src_tags: url = tag.get("src", None) elif name == "param": name = tag.get("name", "").lower().strip() if name in param_names: url = tag.get("value", None) elif name == "object": url = tag.get("data", None) elif name == "applet": url = tag.get("code", None) elif name == "meta": name = tag.get("name", "").lower().strip() if name == "http-equiv": content = tag.get("content", "") p = content.find(";") if p >= 0: url = content[p + 1:] elif name == "base": url = tag.get("href", None) if url is not None: # Unicode URLs are not supported. try: url = str(url) except Exception: continue # Update the base URL. try: base_url = urljoin(base_url, url.strip(), allow_fragments=False) except Exception: continue # If we found an URL in this tag... if url is not None: # Unicode URLs are not supported. try: url = str(url) except Exception: continue # Canonicalize the URL. try: url = urljoin(base_url, url.strip()) except Exception: continue # Discard URLs that are not links to other pages or resources. if not only_links or is_link(url, base_url=base_url): # Add the URL to the set. add_result(url) # Return the set of collected URLs. return result
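# A small hedged demo of extract_from_html() above. It assumes the module-level imports the
# function relies on (urljoin and urldefrag from urlparse, plus BeautifulSoup) are in place;
# with only_links=False the is_link() helper is never consulted. The sample markup is invented.
sample = ('<html><body><base href="http://example.com/">'
          '<a href="/about">about</a><img src="logo.png"></body></html>')
print sorted(extract_from_html(sample, "http://example.com/index.html", only_links=False))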
class Config(dict):
    """
    Word is that the maintainer is particularly lazy, and this config parser is the
    proof: the configuration file is simply parsed with BeautifulSoup....

    @brief Example:
    //file demo.conf
    <root>
    <db type="string">
    host=localhost;user=root;pass=;db=test;
    </db>
    </root>

    #file test.py
    import hyer.config
    config=hyer.config.Config(open("demo.conf").read())
    print config["root"]["db"].values()
    """
    def __init__(self, content, last_find=None):
        self.content = content
        self.last_find = last_find
        self.soup = BeautifulSoup(self.content)
        self.builders = {
            "string": str,
            "regexp": regexp,
            "list": list,
            "python": eval,
            "regexp_list": regexp_list,
            "python_list": python_list
        }

    def __str__(self):
        return self.content

    def values(self):
        results = []
        for item in self.last_find:
            try:
                type = item["type"]
            except:
                type = "string"
            results.append(self.builders[type](R_TRIM.sub("", str(item))))
        return results

    def value(self):
        item = self.last_find[0]
        try:
            type = item["type"]
        except:
            type = "string"
        return self.builders[type](R_TRIM.sub("", str(item)))

    def sections(self):
        results = []
        for item in self.last_find:
            results.append(str(item))
        return results

    def __getitem__(self, key):
        data = self.soup.findAll(key)
        self.last_find = data
        if len(data) == 0:
            return NoneConfig()
        else:
            return Config(str(data), self.last_find)
def Genre(self, genre, filter, page, totalpage): if 'Top' in genre: url = self.url_base + '/top50/' + self.genrelist[genre] if filter != "": url = url + '/' + str(filter) type = 'table' elif genre == 'Kijktips': url = self.url_base + '/kijktips/etalage' type = 'json' else: url = self.url_base + '/7dagen/' + self.genrelist[genre] if filter != "": url = url + ',' + str(filter) url = url + '?weergave=detail&page=' + str(page) type = 'ol' if type == 'json': data = tools.urlopen(self.app, url, {'cache': 3600, 'xhr': True}) json_data = json.loads(data) genrelist = [] if len(data) < 1: mc.ShowDialogNotification("No genre found for " + str(genre)) return genrelist for item in json_data: genreitem = CreateEpisode() if item['name'] != item['series_name']: genreitem.name = item['series_name'] + ': ' + item['name'] else: genreitem.name = item['name'] genreitem.id = self.url_base + item['link'] genreitem.description = item['contents'] genreitem.thumbnails = item['thumbnail'] genreitem.page = page genreitem.totalpage = totalpage genrelist.append(genreitem) return genrelist else: data = tools.urlopen(self.app, url, {'cache': 3600}) genrelist = [] if data == "": mc.ShowDialogNotification("No genre found for " + str(genre)) return genrelist soup = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES, smartQuotesTo="xml") if totalpage == "": try: pagediv = soup.findAll('div', {'class': 'pagination'})[0] apage = pagediv.findAll("a") totalpage = int(apage[len(apage) - 2].contents[0]) except: totalpage = 1 if type == 'table': div_show = soup.find('table', {'class': 'episodes'}) list = div_show.findAll("tr") elif type == 'ol': div_show = soup.find('ol', {'class': 'broadcasts detail'}) list = div_show.findAll("li") for info in list: try: omroep = info.findAll( attrs={"class": "broadcaster-logo"})[0]['alt'] item = True except: item = False if item: if omroep == "Nederland 1": omroep = "nl1" elif omroep == "Nederland 2": omroep = "nl2" elif omroep == "Nederland 3": omroep = "nl3" try: thumb = info.findAll( attrs={"class": "thumbnail"})[0]['src'] except: thumb = info.findAll( attrs={"class": "thumbnail placeholder"})[0]['src'] path = self.url_base + info.find( attrs={"class": "thumbnail_wrapper"})['href'] if type == 'ol': title = info.findAll( attrs={"class": "series"})[0].contents[0] desc = info.find('div', { 'class': 'description' }).p.contents[0] date = info.find(attrs={ "class": "channel" }).contents[0].replace(' ', '').replace( '\n', '').replace('\t', '').replace('op', '').replace('om', '') if type == 'table': title = info.findAll( attrs={"class": "series"} )[0].contents[0] + ': [COLOR FFA6A6A6]' + info.find( 'a', { 'class': 'episode' }).contents[0] + '[/COLOR]' desc = '' date = info.find( 'td', {'class': 'right'})['title'].split(' ')[0] genreitem = CreateEpisode() genreitem.name = title genreitem.id = path genreitem.description = desc genreitem.thumbnails = thumb genreitem.date = date genreitem.filter = str(omroep).upper() genreitem.page = page genreitem.totalpage = totalpage genrelist.append(genreitem) return genrelist
def getData(user, pw): itemlist = [] # get HTML link = 'https://www.onlinetvrecorder.com/v2/?go=home' data = functions.getHTML(user, pw, link) # logged in result = data.replace('\'', '\"') soup = BeautifulSoup(result) # search for highlights tables = soup.findAll('div', {'class': 'content'}) for table in tables: # check its the right table taList = table.find('div', {'class': 'homedoublehighlight'}) if taList is not None: x = ItemClass() taList = table.find('td') sStyle = taList['style'].encode() m = re.search('background-image:url\((?P<thumb>.*?)\)', sStyle) if (m is not None): x.thumb = m.group('thumb') else: x.thumb = 'DefaultVideo.png' h1 = table.find('a') x.url = h1['href'] # we just want the id s = x.url.index('id=') x.url = x.url[s + 3:] sp = table.find('span') x.title = sp.text text1 = table.find('div', {'class': 'homedoublehighlight'}) x.text = text1.text x.vid = '' x.text = x.text.replace('|', '\n') itemlist.append(x) # search for actual movies content = soup.findAll('div', {'class': 'homethree'}) for c in content: x = ItemClass() link = c.find('a', {'class': 'homethreehredbig'}) if link is None: break x.url = link['href'] # we just want the id s = x.url.index('id=') x.url = x.url[s + 3:] title = c.find('div', {'class': 'toolbardiv'}) x.title = title.text data = c.findAll('div', {'class': 'homethreee'}) x.thumb = 'DefaultVideo.png' x.vid = '' for e in data: img = e.find('img') if img is not None: x.thumb = img['src'] else: sty = e['style'] m = re.search('background-image:url\((?P<thumb>.*?)\)', sty) if (m is not None): x.thumb = m.group('thumb') vid = e.find('video') if vid is not None: x.vid = vid['src'] desc = c.find('div', {'class': 'homethreec'}) x.text = desc.text x.text = x.text.replace('|', '\n') itemlist.append(x) return itemlist
#!/usr/bin/env python
import pdfkit
import requests
import getpass
from BeautifulSoup import BeautifulSoup
import sys

user = sys.argv[2]
urlroot = sys.argv[1]
passwd = getpass.getpass()
auth = (user, passwd)

r = requests.get("%s/ringsheets" % urlroot, auth=auth, verify=False)
soup = BeautifulSoup(r.text)
for x in soup.findAll('a')[5:]:
    path = x.get('href')
    print path
    filename = path.replace('/', '_').replace('?', '_').replace('&', '_').replace('=', '_')
    options = {
        'page-size': 'Letter',
        'margin-top': '0.5in',
        'margin-right': '0.5in',
        'margin-bottom': '0.5in',
        'margin-left': '0.5in',
        'encoding': "UTF-8",
        'username': user,
        'password': passwd,
        'zoom': '.9'
    }
    pdfkit.from_url("%s/%s" % (urlroot, path), "%s.pdf" % filename, options=options)
def __unicode__(self):
    # avoid shadowing the built-in str
    text = unicode(BeautifulSoup(self.text, convertEntities=BeautifulSoup.HTML_ENTITIES))
    return nltk.clean_html(text)
def getMoreData(user, pw, page): # page 1 display page without big highlights itemlist = [] # init browser br = mechanize.Browser() br.set_handle_robots(False) br.open("https://www.onlinetvrecorder.com/v2/?go=home") # login br.select_form('fhomelogin') br['email'] = user br['password'] = pw br.submit().read() select = 0 # -2 returns also the highlights if (page == 2): select = 14 if (page > 2): select = (15 * (page - 1)) - 1 params = {u'language': 'de', u'start': str(select)} data = urllib.urlencode(params) response = br.open( "https://www.onlinetvrecorder.com/v2/ajax/get_homethree.php", data) result = response.read() # logged in result = result.replace('\'', '\"') soup = BeautifulSoup(result) # search for actual movies content = soup.findAll('div', {'class': 'homethree'}) for c in content: x = ItemClass() link = c.find('a', {'class': 'homethreehredbig'}) if link is None: break x.url = link['href'] # we just want the id s = x.url.index('id=') x.url = x.url[s + 3:] title = c.find('div', {'class': 'toolbardiv'}) x.title = title.text data = c.findAll('div', {'class': 'homethreee'}) x.thumb = 'DefaultVideo.png' for e in data: img = e.find('img') if img is not None: x.thumb = img['src'] else: sty = e['style'] m = re.search('background-image:url\((?P<thumb>.*?)\)', sty) if (m is not None): x.thumb = m.group('thumb') vid = e.find('video') if vid is not None: x.vid = vid['src'] desc = c.find('div', {'class': 'homethreec'}) x.text = desc.text itemlist.append(x) return itemlist
def get_bill_info(self, chamber, session, bill_id): print 'Getting %s %s' % (session, bill_id) detail_url = 'http://www.leginfo.ca.gov/cgi-bin/postquery?bill_number=%s_%s&sess=%s' % ( bill_id[:2].lower(), bill_id[2:], session.replace('-', '')) # Get the details page and parse it with BeautifulSoup. These # pages contain a malformed 'p' tag that (certain versions of) # BS choke on, so we replace it with a regex before parsing. details_raw = urllib2.urlopen(detail_url).read() details_raw = details_raw.replace('<P ALIGN=CENTER">', '') details = BeautifulSoup(details_raw) # Get the history page (following a link from the details page). # Once again, we remove tags that BeautifulSoup chokes on # (including all meta tags, because bills with quotation marks # in the title come to us w/ malformed meta tags) hist_link = details.find(href=re.compile("_history.html")) hist_url = 'http://www.leginfo.ca.gov%s' % hist_link['href'] history_raw = urllib2.urlopen(hist_url).read() history_raw = history_raw.replace( '<! ****** document data starts here ******>', '') rem_meta = re.compile('</title>.*</head>', re.MULTILINE | re.DOTALL) history_raw = rem_meta.sub('</title></head>', history_raw) history = BeautifulSoup(history_raw) # Find title and add bill title_match = re.search('TOPIC\t:\s(\w.+\n(\t\w.*\n){0,})', history_raw, re.MULTILINE) bill_title = title_match.group(1).replace('\n', '').replace('\t', ' ') self.add_bill(chamber, session, bill_id, bill_title) # Find author (primary sponsor) sponsor_match = re.search('^AUTHOR\t:\s(.*)$', history_raw, re.MULTILINE) bill_sponsor = sponsor_match.group(1) self.add_sponsorship(chamber, session, bill_id, 'primary', bill_sponsor) # Get all versions of the bill text_re = '%s_%s_bill\w*\.html' % (bill_id[:2].lower(), bill_id[2:]) links = details.find(text='Bill Text').parent.findAllNext( href=re.compile(text_re)) for link in links: version_url = "http://www.leginfo.ca.gov%s" % link['href'] # This name is not necessarily unique (for example, there may # be many versions called simply "Amended"). Perhaps we should # add a date or something to make it unique? version_name = link.parent.previousSibling.previousSibling.b.font.string self.add_bill_version(chamber, session, bill_id, version_name, version_url) # Get bill actions action_re = re.compile( '(\d{4})|([\w.]{4,6}\s+\d{1,2})\s+(.*(\n\s+.*){0,})', re.MULTILINE) act_year = None for act_match in action_re.finditer(history.find('pre').contents[0]): # If we didn't match group 2 then this must be a year change if act_match.group(2) == None: act_year = act_match.group(1) continue # If not year change, must be an action act_date = act_match.group(2) action = act_match.group(3).replace('\n', '').replace(' ', ' ').replace( '\t', ' ') self.add_action(chamber, session, bill_id, chamber, action, act_date)
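# To make the action-parsing regex easier to follow, here is a small, self-contained
# demonstration on illustrative text (not actual leginfo.ca.gov output): a bare
# 4-digit line is treated as a year change, everything else as a dated action.

import re

action_re = re.compile(
    '(\d{4})|([\w.]{4,6}\s+\d{1,2})\s+(.*(\n\s+.*){0,})', re.MULTILINE)

sample = """2009
Feb. 26    Read first time. To print.
Mar. 23    Referred to Com. on HEALTH."""

act_year = None
for m in action_re.finditer(sample):
    if m.group(2) is None:
        act_year = m.group(1)   # a bare 4-digit line marks a year change
    else:
        print act_year, m.group(2), '->', m.group(3)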
def scrape(output_file, url_id, url_end, type):
    with open(output_file, 'wb') as csvfile:
        w = unicodecsv.writer(csvfile, encoding='utf-8')
        headers = [
            'county', 'office', 'district', 'party', 'candidate', 'votes'
        ]
        w.writerow(headers)
        for i in range(len(counties)):
            # Santa Fe results live at a slightly different URL than the other counties.
            if counties[i] == 'Santa Fe':
                url = 'http://www.sos.state.nm.us/uploads/FileLinks/' + url_id + '/conty000' + url_end + '.htm'
            else:
                url = 'http://www.sos.state.nm.us/uploads/FileLinks/' + url_id + '/conty0' + getCounty(
                    i) + '.HTM' + url_end + '.html'
            r = requests.get(url)
            soup = BeautifulSoup(r.text)
            hed = str(soup.find('h2'))
            tables = soup.findAll('table')
            if type == 'general':
                # general-election pages carry two extra trailing tables that are skipped
                tables = tables[:len(tables) - 2]
            count = 0
            for table in tables:
                count = count + 1
                district = ''
                # The first table is always the presidential race; the rest carry
                # an "OFFICE - DISTRICT" style heading.
                if count > 1:
                    office_district = table.findAll('h2')[0].getText().split('-')
                else:
                    office_district = ['PRESIDENT OF THE UNITED STATES']
                if len(office_district) > 1:
                    second_word = office_district[1].split(' ')[1]
                    if second_word == 'DISTRICT' or second_word == 'DIVISION':
                        district = office_district[1].split(' ')[-1]
                        if district not in district_exclusions:
                            district = int(district)
                        else:
                            district = ''
                for row in table.findAll('tr'):
                    col = row.findAll('td')
                    county = counties[i]
                    office = office_district[0]
                    party = clean(col[1]).strip()
                    candidate = clean(col[0]).strip()
                    votes = clean(col[2]).strip()
                    if candidate:
                        w.writerow([
                            county, office.strip(), district, party,
                            candidate, votes
                        ])
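# scrape() relies on module-level helpers that aren't shown here (counties,
# district_exclusions, getCounty, clean). The sketch below is only an assumption
# about what the two functions might look like, to make the loop above easier to
# follow; the real implementations may differ.

def getCounty(i):
    # Assumed: counties are numbered 01, 02, ... in the SOS file names.
    return str(i + 1).zfill(2)

def clean(cell):
    # Assumed: flatten a <td> cell to plain text and drop non-breaking spaces.
    return cell.getText().replace('&nbsp;', ' ')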
def parseLoginError(self, res): page = BeautifulSoup(res.read()) r = page.findAll('span', attrs={'class': 'error'}) return r
def get_soup(url): """Request webpage and return a Beautiful Soup object.""" resp = requests.get(url) soup = BeautifulSoup(resp.text) return soup
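# A tiny usage example for get_soup(), with example.com standing in for a real URL.

soup = get_soup('http://example.com')
for a in soup.findAll('a'):
    print(a.get('href'))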
# Query window: tomorrow through seven days later, in ROC (Minguo) dates
# (ROC year = Gregorian year - 1911, followed by MMDD).
t = date.today() + timedelta(days=1)
sevenday = t + timedelta(days=7)
opd_date = str(t.timetuple().tm_year - 1911) + t.strftime("%m%d")
opd_date2 = str(sevenday.timetuple().tm_year - 1911) + sevenday.strftime("%m%d")
data = {
    'Opd_date': opd_date,
    'Opd_date2': opd_date2,
    'dept_code': dept_code,
    'doc_code': '',
    'Submit1': '確認送出'  # submit-button value expected by the form ("confirm and send")
}
page = fetchPOSTHtml("http://www.wanfang.gov.tw/W402008web_new/opdreg.asp", data)
soup = BeautifulSoup(page)
table = soup.findAll('tr', align="middle")[0].parent
# time_shift holds a Gregorian date plus a shift letter; convert the date to the
# ROC format used in the table and map the letter to a column index (A=1, B=2, ...).
time = str(int(time_shift[0:4]) - 1911) + time_shift[5:7] + time_shift[8:10]
shift = ord(time_shift[11:12]) - 64
tr = table.findAll(lambda tag: tag.text.find(time) > -1)
if tr == []:
    status = 1
    message = u"找不到可掛號時段."  # "No bookable time slot found."
else:
    a = tr[0].contents[shift * 2 + 1].findAll(attrs={'href': re.compile(doct_code)})
    if a == []:
        status = 1
        message = u"找不到可掛號時段!"  # "No bookable time slot found!"
    else:
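# fetchPOSTHtml() is not defined in this snippet; a minimal sketch of what it is
# assumed to do (POST a form and return the response body), in the same Python 2
# urllib style as the rest of the code.

import urllib
import urllib2

def fetchPOSTHtml(url, data):
    # Assumed helper: urlencode the form fields, POST them, return the HTML.
    req = urllib2.Request(url, urllib.urlencode(data))
    resp = urllib2.urlopen(req)
    try:
        return resp.read()
    finally:
        resp.close()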
def loadVideos(url, name): #try: newlink = url xbmc.executebuiltin( "XBMC.Notification(Please Wait!,Loading selected video)") print newlink playtype = "direct" if (newlink.find("dailymotion") > -1): match = re.compile( '(dailymotion\.com\/(watch\?(.*&)?v=|(embed|v|user)\/))([^\?&"\'>]+)' ).findall(newlink) lastmatch = match[0][len(match[0]) - 1] link = 'http://www.dailymotion.com/' + str(lastmatch) req = urllib2.Request(link) req.add_header( 'User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3' ) response = urllib2.urlopen(req) link = response.read() response.close() sequence = re.compile('"sequence", "(.+?)"').findall(link) newseqeunce = urllib.unquote(sequence[0]).decode('utf8').replace( '\\/', '/') #print 'in dailymontion:' + str(newseqeunce) imgSrc = re.compile('"videoPreviewURL":"(.+?)"').findall(newseqeunce) if (len(imgSrc[0]) == 0): imgSrc = re.compile('/jpeg" href="(.+?)"').findall(link) dm_low = re.compile('"sdURL":"(.+?)"').findall(newseqeunce) dm_high = re.compile('"hqURL":"(.+?)"').findall(newseqeunce) vidlink = urllib2.unquote(dm_low[0]).decode("utf8") elif (newlink.find("4shared") > -1): d = xbmcgui.Dialog() d.ok('Not Implemented', 'Sorry 4Shared links', ' not implemented yet') elif (newlink.find("docs.google.com") > -1 or newlink.find("drive.google.com") > -1): docid = re.compile('/d/(.+?)/preview').findall(newlink)[0] cj = cookielib.LWPCookieJar() (cj, vidcontent) = GetContent2( "https://docs.google.com/get_video_info?docid=" + docid, "", cj) html = urllib2.unquote(vidcontent) cookiestr = "" try: html = html.encode("utf-8", "ignore") except: pass stream_map = re.compile('fmt_stream_map=(.+?)&fmt_list').findall(html) if (len(stream_map) > 0): formatArray = stream_map[0].replace("\/", "/").split(',') for formatContent in formatArray: formatContentInfo = formatContent.split('|') qual = formatContentInfo[0] url = (formatContentInfo[1]).decode('unicode-escape') else: cj = cookielib.LWPCookieJar() newlink1 = "https://docs.google.com/uc?export=download&id=" + docid (cj, vidcontent) = GetContent2(newlink1, newlink, cj) soup = BeautifulSoup(vidcontent) downloadlink = soup.findAll('a', {"id": "uc-download-link"})[0] newlink2 = "https://docs.google.com" + downloadlink["href"] url = GetDirVideoUrl(newlink2, cj) for cookie in cj: cookiestr += '%s=%s;' % (cookie.name, cookie.value) vidlink = url + ('|Cookie=%s' % cookiestr) elif (newlink.find("vimeo") > -1): idmatch = re.compile( "http://player.vimeo.com/video/([^\?&\"\'>]+)").findall(newlink) if (len(idmatch) > 0): playVideo('vimeo', idmatch[0]) elif (newlink.find("youtube") > -1) and (newlink.find("playlists") > -1): playlistid = re.compile('playlists/(.+?)\?v').findall(newlink) vidlink = "plugin://plugin.video.youtube?path=/root/video&action=play_all&playlist=" + playlistid[ 0] elif (newlink.find("youtube") > -1) and (newlink.find("list=") > -1): playlistid = re.compile('videoseries\?list=(.+?)&').findall(newlink + "&") vidlink = "plugin://plugin.video.youtube?path=/root/video&action=play_all&playlist=" + playlistid[ 0] elif (newlink.find("youtube") > -1) and (newlink.find("/p/") > -1): playlistid = re.compile('/p/(.+?)\?').findall(newlink) vidlink = "plugin://plugin.video.youtube?path=/root/video&action=play_all&playlist=" + playlistid[ 0] elif (newlink.find("youtube") > -1) and (newlink.find("/embed/") > -1): playlistid = re.compile('/embed/(.+?)\?').findall(newlink + "?") vidlink = getYoutube(playlistid[0]) elif (newlink.find("youtube") > -1): match = re.compile( 
        '(youtu\.be\/|youtube-nocookie\.com\/|youtube\.com\/(watch\?(.*&)?v=|(embed|v|user)\/))([^\?&"\'>]+)'
        ).findall(newlink)
        if (len(match) == 0):
            # fall back to the plain watch URL form; search newlink, not the
            # Google-Docs-only newlink1, which is undefined in this branch
            match = re.compile(
                'http://www.youtube.com/watch\?v=(.+?)&dk;').findall(newlink)
        if (len(match) > 0):
            lastmatch = match[0][len(match[0]) - 1].replace('v/', '')
            print "in youtube " + lastmatch
            vidlink = lastmatch
            playtype = "youtube"
    else:
        sources = []
        label = name
        hosted_media = urlresolver.HostedMediaFile(url=newlink, title=label)
        sources.append(hosted_media)
        source = urlresolver.choose_source(sources)
        print "inresolver=" + newlink
        if source:
            vidlink = source.resolve()
        else:
            vidlink = ""
    playVideo(playtype, vidlink)
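# The Google Docs branch above calls a GetContent2() helper that isn't part of this
# snippet; from the call sites it appears to fetch a URL (optionally with a referer)
# while collecting cookies into the supplied jar. A rough sketch under those assumptions:

import cookielib
import urllib2

def GetContent2(url, referer, cj):
    # Assumed helper: fetch `url`, sending `referer` when given, and collect
    # any cookies into the caller's LWPCookieJar so they can be replayed later.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    req = urllib2.Request(url)
    if referer:
        req.add_header('Referer', referer)
    content = opener.open(req).read()
    return (cj, content)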
def convert(text):
    global g_allIgnoredP
    # preprocess the text to simplify parsing
    # remove <p><! p. xxx !></p>
    text = text[text.find("!>") + 2:]
    # remove page info
    i = text.find("<!")
    while -1 != i:
        i2 = text[i + 1:].find("!>")
        assert -1 != i2
        i2 += i + 1
        text = text[:i] + text[i2 + 2:]
        i = text.find("<!")
    text = text.replace("\n\n<p></p>", "")
    # merge blockquotes into the same <p> as the definition
    text = text.replace("</p>\n\n<p><blockquote>", "\n\n<blockquote>")
    # merge <col> into the same <p> as the definition
    text = text.replace("</p>\n\n<p><col>", "\n\n<col>")
    # merge the Syn. block into the same <p> as the definition
    text = text.replace("</p>\n\n<p><b>Syn.", "\n\n<b>Syn.")
    print "  start parsing (feeding soup - it may take a while)"
    # start parsing (uses the older BeautifulSoup API: feed/first/fetch)
    soup = BeautifulSoup()
    soup.feed(text)
    print "  soup fed"
    pList = soup.fetch("p")
    currentPos = "ignore"
    currentWord = ""
    currentDef = ""
    currentQuotes = []
    # add word
    # addWord(currentWord, currentPos, currentDef, currentQuotes)
    counter = 0
    for p in pList:
        counter += 1
        if counter % 2000 == 0:
            print "  counter: %d\t Last word: %s" % (counter, currentWord)
        pos = p.first("pos")
        if pos:
            currentPos = getPos(getAllTextFromTag(pos))
        if currentPos != "ignore":
            hw = p.first("hw")
            if hw:
                txt = getAllTextFromTag(hw)
                currentWord = removeAccents(txt)
            defs = p.first("def")
            currentDef = ""
            if defs:
                currentDef = getAllTextFromTag(defs)
            currentQuotes = []
            for q in p.fetch("blockquote"):
                currentQuotes.append(getQuote(q))
            if currentDef != "":
                if currentDef.startswith("See "):
                    handleSeeWord(currentWord, currentPos, currentDef,
                                  currentQuotes)
                else:
                    addWord(currentWord, currentPos, currentDef,
                            currentQuotes)
        else:
            g_allIgnoredP += str(p) + "\n\n"
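# convert() calls several helpers defined elsewhere (getPos, getAllTextFromTag,
# removeAccents, getQuote, addWord, handleSeeWord). Purely as an illustration of the
# assumed contract, getAllTextFromTag is taken to flatten a tag to its text content,
# roughly like this:

def getAllTextFromTag(tag):
    # Assumed behaviour: recursively concatenate every text node under `tag`.
    parts = []
    for node in tag.contents:
        if hasattr(node, 'contents'):
            parts.append(getAllTextFromTag(node))
        else:
            parts.append(unicode(node))
    return u"".join(parts)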
# Set the user-agent to Mozilla - if the site detects Mechanize it may not return all form fields
br.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
)]
# open the URL previously defined as 'starting_url'
br.open(starting_url)
# list the names of any forms in the HTML, i.e. <form ... name="...">
print "All forms:", [form.name for form in br.forms()]
# as it happens, the form on this page is simply named "form"
br.select_form(name="form")
# submit the form and keep the result in 'response'
response = br.submit()
# build a soup object from the response body
soup = BeautifulSoup(br.response().read())
# Have a look at 'soup': note the 'onSubmit' JavaScript function that is called when
# you click the 'next' link; we mimic this in the function defined above.
print soup
# START scraping by running the scrape_table function created above
scrape_table(soup)
# To scrape more than one results page, replace the previous line with the
# function below, which runs scrape_table itself for each page:
#scrape_and_look_for_next_link(soup)
# to inspect the form contents before the next step:
#print br.form
# if the form required certain fields to be filled or selected, that would happen here,
# like so: br["ctl00$phMainContent$dropDownAwardDate"] = ["Between"]
# see https://scraperwiki.com/scrapers/new/python?template=tutorial-mechanize#
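# The pagination helper mentioned in the comments above isn't included in this snippet.
# A rough sketch of what scrape_and_look_for_next_link() could look like with the same
# mechanize browser, assuming the results pages expose a plain 'Next' link (the real
# page may drive paging through the onSubmit JavaScript instead, as the comment notes):

import re
import mechanize  # already imported in the full script

def scrape_and_look_for_next_link(soup):
    # Scrape the current page, then follow a 'Next' link if mechanize finds one.
    scrape_table(soup)
    try:
        br.follow_link(text_regex=re.compile('Next', re.I))
    except mechanize.LinkNotFoundError:
        return  # no further pages
    next_soup = BeautifulSoup(br.response().read())
    scrape_and_look_for_next_link(next_soup)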