def lyrics():
    """ Extracts lyrics from the current song on 'bide et musique' """
    res = ""
    page = urllib.urlopen(HOME_PAGE)
    content = page.read()
    page.close()
    soup = BeautifulSoup(content)
    souptitle = soup.findAll("p", {"class": "titre-song"})[0]
    title = souptitle.text
    artist = soup.findAll("p", {"class": "titre-song2"})[0].text
    url = "http://www.bide-et-musique.com"
    url = "%s%s" % (url, souptitle.a.get("href"))
    page = urllib.urlopen(url)
    content = page.read()
    page.close()
    soup = BeautifulSoup(content)
    tab = soup.findAll("td", {"class": "paroles"})
    if tab == []:
        res = "Pas de paroles disponibles pour %s de %s" % (artist, title)
    else:
        tab = tab[0].contents
        res = "%s - %s\n%s\n" % (artist, title, "*" * 30)
        lyrics_content = ""
        for elt in tab:
            tmp = elt
            if str(tmp).lstrip() != "<br />":
                lyrics_content += xhtml2text(unicode(tmp).lstrip()) + "\n"
        res += lyrics_content
    return xhtml2text(res)
def getAvailabilityRank(table):
    try:
        # print "getting List of ATMs requires attention..."
        soup = BeautifulSoup(str(table))
        rows = soup.findAll('tr')
        numRows = getRowsNumber(table)
        numRowsHead = getRowsHeadNumber(table)
        arrBestBranchBri = []
        for a in range(2, numRows - 1):
            trs = BeautifulSoup(str(rows[a]))
            tdcells = trs.findAll("td")
            percentAvailBri = float(tdcells[17].getText())
            ukerName = cleanUpNamaUker(tdcells[0].getText())
            if percentAvailBri == 100.00:
                # arrBestBranch.append(ukerName+", "+jumlahATM)
                arrBestBranchBri.append(ukerName)
    except IndexError:
        arrBestBranchBri = getAvailabilityRank(table)
    return sorted(arrBestBranchBri)
def firstn(self, count=5, **kwargs):
    url = "http://iwiw.hu/search/pages/user/ajaxsearch.jsp?do=AdvancedSearch&page=0&"
    url += urlencode(kwargs)
    iwiwsearch = urlfetch.fetch(url, headers={'Cookie': self.logincookie}).content
    # try:
    leves = BeautifulSoup(iwiwsearch)
    mennyivan = len(leves.findAll("div", "cardContainer"))
    count = int(count)
    if mennyivan - count >= 0:
        mennyit = count
    if mennyivan - count < 0:
        mennyit = mennyivan
    results = []
    for i in range(mennyit):
        ez = leves.findAll("div", "cardContainer")[i]
        # userid = ez.find("a")["name"].replace("uid","")
        ebben_van_a_popup_url = ez.findChildren("a")[2]["onclick"]
        pic_popup_url = re.search("'.*?'", ebben_van_a_popup_url).group(0)
        pic_thumbnail = ez.img["src"]
        name = ez.findChildren("a")[1].contents[0]
        profile_url = ez.findChildren("a")[1]["href"]
        result = {
            "name": name,
            "profile_url": profile_url,
            "pic_thumbnail": pic_thumbnail,
            "pic_popup_url": pic_popup_url
        }
        # append the result
        results.append(result)
    return results
def getCDMStats(table):
    soup = BeautifulSoup(str(table))
    rows = soup.findAll('tr')
    numRows = getRowsNumber(table)
    numRowsHead = getRowsHeadNumber(table)
    # print numRowsHead, numRows
    msgBody = ""
    for i in range(0, numRows):
        trs = BeautifulSoup(str(rows[i]))
        tdcells = trs.findAll("td")
        thcells = trs.findAll("th")
        # print len(tdcells), len(thcells)
        if thcells:
            msgBody += "\n*" + thcells[0].getText().upper() + "*\n----------------------------------\n"
        if tdcells:
            if len(tdcells) > 1:
                msgBody += tdcells[0].getText().upper() + ": " + asterisk(tdcells[1].getText()) + "\n"
    return msgBody.replace("_", " ")
def getMovieData(self): list = [] #-- get serial play list & parameters ------------------------------------- html = self.Auth.get_HTML(self.serial_url, None, 'http://serialu.net/media/uppod.swf') # -- parsing web page html = re.compile('<body>(.+?)<\/body>', re.MULTILINE|re.DOTALL).findall(html)[0] soup = BeautifulSoup(html) pl_url = '' is_multiseason = len(soup.findAll('object', {'type':'application/x-shockwave-flash'})) for rec in soup.findAll('object', {'type':'application/x-shockwave-flash'}): if is_multiseason > 1: season = rec.parent.previousSibling.previousSibling.text+r' ' else: season = r'' for par in rec.find('param', {'name':'flashvars'})['value'].split('&'): if par.split('=')[0] == 'pl': pl_url = par[3:] if pl_url.find('http:') == -1: pl_url = xppod.Decode(pl_url) #-- get playlist details --------------------------------------------------- html = self.Auth.get_HTML(pl_url, None, 'http://serialu.net/media/uppod.swf') self.pl_url = pl_url # -- check if playlist is encoded if html.find('{"playlist":[') == -1: html = xppod.Decode(html).encode('utf-8').split(' or ')[0] #-- TODO: make smart choice # -- parsing web page s_url = '' s_num = 0 movie_list = [] for rec in re.compile('{(.+?)}', re.MULTILINE|re.DOTALL).findall(html.replace('{"playlist":[', '')): for par in rec.replace('"','').split(','): if par.split(':')[0]== 'comment': name = str(s_num+1) + ' серия' #par.split(':')[1]+' ' if par.split(':')[0]== 'file': if 'http' in par.split(':')[1]: s_url = par.split(':')[1]+':'+par.split(':')[2] else: s_url = xppod.Decode(par.split(':')[1]).split(' or ')[0] s_num += 1 # mark part for history name = season.encode('utf-8') + name movie_list.append({'movie_name': name, 'url': s_url}) #if h_part <> '-': # if name == h_part: # name = '[COLOR FF00FF00]'+name+'[/COLOR]' #-- parse data list.append({'name':self.serial_name, 'img': self.serial_img, 'descr': self.serial_descr, 'season_number':s_num, 'name_orig':'', 'movie': movie_list}) #-- return movie list return list
def parseImgLinks(self, depth=1):
    url_response = None
    try:
        url_response = urllib2.urlopen(self.scrap_url, timeout=self._timeout)
    except Exception as e:
        print(" [ERROR]: Could not open {0}: {1}".format(self.scrap_url, e.reason))
        return self.img_list

    html_parse = BeautifulSoup(url_response)
    unique_images_found = 0
    total_images_found = 0
    self.visited[self.scrap_url] = 1

    for img in html_parse.findAll('img'):
        try:
            abs_url = urljoin(self.scrap_url, img['src']) if urlparse(img['src']).netloc == "" else img['src']
            if abs_url not in self.img_list:
                self.img_list.add(abs_url)
                unique_images_found += 1
            total_images_found += 1
        except:
            pass

    print(" [Found %d images / %d new]: %s" % (total_images_found, unique_images_found, self.scrap_url))

    if depth > 1:
        for a in html_parse.findAll('a'):
            try:
                if (urlparse(a['href']).netloc == "") or (urlparse(self.scrape_url_orig).netloc == urlparse(a['href']).netloc):
                    self.scrap_url = urljoin(self.scrape_url_orig, a['href'])
                    if self.scrap_url in self.visited:
                        continue
                    self.parseImgLinks(depth - 1)
            except:
                pass
    return self.img_list
def get_submission_info(self):
    # Only valid after self.execute()
    # Parse submission information out of response.
    soup = BeautifulSoup(self.res_data)
    info = {}

    # Get submission status.
    tag = soup.findAll(id="submission_status")[0]
    info['status'] = tag.text.strip()

    # Get compilation text.
    tags = soup.findAll(id="compilation")
    if tags:
        content = tags[0]
        info['compile_output'] = content.pre.text.strip()
    else:
        info['compile_output'] = None

    # Get evaluation results.
    evaluations = []
    tags = soup.findAll(id=re.compile(r"^eval_outcome_"))
    text_tags = soup.findAll(id=re.compile(r"^eval_text_"))
    for outcome_tag, text_tag in zip(tags, text_tags):
        # Get evaluation text also.
        evaluations.append({
            'outcome': outcome_tag.text.strip(),
            'text': text_tag.text.strip(),
        })
    info['evaluations'] = evaluations

    return info
def __call__(self):
    html = self.render()
    soup = BeautifulSoup(html)
    for img in soup.findAll('img', {'class': 'leadimage'}):
        img['hspace'] = 8
        img['vspace'] = 8
    utm = self.getUTM(source='newsletter', medium='email', campaign=self.newsletter_title)
    for a in soup.findAll('a'):
        if '?' in a['href']:
            a['href'] = '%s&%s' % (a['href'], utm)
        else:
            a['href'] = '%s?%s' % (a['href'], utm)
    html = premailer.transform(soup.prettify())
    tags = ['dl', 'dt', 'dd']
    for tag in tags:
        html = html.replace("<%s" % tag, "<div")
        html = html.replace("</%s" % tag, "</div")
    return html
def LyricWikia(artist, title):
    url = 'http://lyrics.wikia.com/api.php?action=lyrics&artist={artist}&song={title}&fmt=json&func=getSong'.format(artist=artist, title=title).replace(" ", "%20")
    r = requests.get(url, timeout=15)

    # We got some badly formatted JSON data... so we need to fix it up :/
    returned = r.text
    returned = returned.replace("\'", "\"")
    returned = returned.replace("song = ", "")
    returned = json.loads(returned)

    if returned["lyrics"] != "Not found":
        # set the url to the one we just received, and retrieve it
        r = requests.get(returned["url"], timeout=15)
        soup = BeautifulSoup(r.text)
        soup = soup.find("div", {"class": "lyricbox"})
        [elem.extract() for elem in soup.findAll('div')]
        [elem.replaceWith('\n') for elem in soup.findAll('br')]
        # With old BeautifulSoup the following is needed; for recent versions
        # it isn't needed / doesn't work.
        try:
            soup = BeautifulSoup(str(soup), convertEntities=BeautifulSoup.HTML_ENTITIES)
        except:
            pass
        soup = BeautifulSoup(re.sub(r'(<!--[.\s\S]*-->)', '', str(soup)))
        [elem.extract() for elem in soup.findAll('script')]
        return soup.getText()
    else:
        return ()
def parse(self, html):
    """
    Parse the information table on USFIRST's site to extract team information.
    Return a list of dictionaries of team data.
    """
    teams = list()
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for title in soup.findAll('title'):
        if "FRC Team/Event List" not in title.string:
            return None

    team_rows = soup.findAll("pre")[0].string.split("\n")

    for line in team_rows[2:]:  # first row is blank, second is headers.
        data = line.split("\t")
        if len(data) > 1:
            try:
                teams.append({
                    "team_number": int(data[1]),
                    "name": data[2],
                    "short_name": data[3],
                    "nickname": data[7]
                })
            except Exception, e:
                logging.warning("Failed to parse team row: %s" % data)

    return teams
def searchcrawler(url, keyword=''):
    """
    Taobao search-page crawler
    """
    html = get_html(url)
    # print html
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'row item icon-datalink'})
        if items_row:
            print '=======================row search row=========================='
            # print items
            for item in items_row:
                item_info = item.find('div', {'class': 'col title'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
        items_col = soup.findAll('div', {'class': 'col item icon-datalink'})
        if items_col:
            print '=======================row search col=========================='
            # print items
            for item in items_col:
                item_info = item.find('div', {'class': 'item-box'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
def httpParserGuokr(url):
    '''
    Guokr feed: content, title, id, updated
    '''
    content = httpRequest(url)  # send the request
    soup = BeautifulSoup(content)
    item = soup.findAll('content')
    title = soup.findAll('title')
    link = soup.findAll('id')
    pubDate = soup.findAll('updated')
    article = title[0].text
    content = []
    counter = 0
    for t in range(len(title)):
        if t != 0 and t <= 6:
            num = 7 - t
            string = httpXpath(item[num - 1].contents[0].replace('&amp;', '&').replace(
                '&lt;', '<').replace('&gt;', '>').replace('&quot;', '"'))
            newtime = datetime.datetime.strptime(pubDate[num].contents[0][:10], "%Y-%m-%d")
            newtime = newtime.strftime('%Y年%m月%d日')
            arr = {}
            arr['article'] = article
            arr['title'] = title[num].contents[0]
            arr['link'] = link[num].contents[0]
            arr['pubdate'] = newtime
            arr['brief'] = string
            arr['descr'] = item[num - 1].contents[0].encode('utf-8').replace('&amp;', '&').replace(
                '&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
            arr['content'] = ""
            if counter == 0:
                status = Article.objects.filter(link=link[num].text).exists()
                if not status:
                    RssData().keepData(arr)
                    counter = 1
            else:
                RssData().keepData(arr)
def replaceURL(URL, OUTPUT):
    # Provide user feedback
    print "[+] Replacing the URLs in the HTML source."
    print "[+] URLs that will be replaced:"
    # Open source, read lines, and begin parsing to replace all URLs for scripts and links
    try:
        # Print href URLs that will be replaced
        print "\n".join(re.findall('<a href="?\'?([^"\'>]*)', open(OUTPUT).read()))
        with open(OUTPUT, "r+b") as html:
            # Read in the source html and parse with BeautifulSoup
            soup = BeautifulSoup(html)
            # Find all links and replace URLs with our new text/URLs
            for link in soup.findAll('a', href=True):
                link['href'] = '{{links.phishgate}}'
            for link in soup.findAll('link', href=True):
                link['href'] = urlparse.urljoin(URL, link['href'])
            for link in soup.findAll('script', src=True):
                link['src'] = urlparse.urljoin(URL, link['src'])
            source = soup.prettify()
            source = xml.sax.saxutils.unescape(source)
        # Write the updated URLs to output file while removing the [' and ']
        output = open(OUTPUT, "w")
        output.write(source.replace('[', '').replace(']', ''))
        output.close()
        print "[+] URL parsing successful. All URLs have been replaced."
    except:
        print "[-] URL parsing failed. Make sure the html file exists and is readable."
def parseing(url, id, ResultList):
    pDict = {}
    req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
    con = urllib2.urlopen(req)
    html = con.read()
    # print html
    soup = BeautifulSoup(html)
    images = soup.findAll('img', {'class': 'img-responsive screengrab'})
    # for image in images:
    #     print image.get('src')
    image_url = 'https://worldofvnc.net/' + images[0].get('src')
    download_photo(image_url, str(id) + '.jpg')
    ConvertImage(str(id))
    pDict.update({'image_url': image_url})
    pDict.update({'local_image_name': str(id) + '.png'})
    listdata = soup.findAll('li', {'class': 'list-group-item'})
    for x in listdata:
        # print x.b.contents[0], x.contents[1]
        pDict.update({x.b.contents[0]: x.contents[1]})
    ResultList.append(pDict)
    # dataDict.update({'VPN_ID': str(id), 'VPN_Content': pDict})
    # print "inner data"
    # print ResultList
    return ResultList
def _parseHtml(self, html):
    """Parse HTML"""
    ret = []
    soup = BeautifulSoup(html)
    for div in soup.findAll("div", {"class": "giaeffettuate"}) + soup.findAll("div", {"class": "corpocentrale"}):
        station = div.find("h2")
        station = str(station.contents[0])

        # Now get the time
        prog = None
        real = None
        tag = None
        for p in div.findAll("p"):
            t = str(p.contents[0])
            time = p.find("strong")
            if len(time.contents) > 0:
                time = str(time.contents[0])
            else:
                time = "00:00"
            if re.search("(?i)programmat(a|o)", t):
                prog = time.rstrip().lstrip()
            elif re.search("(?i)effettiv(a|o)", t):
                real = time.rstrip().lstrip()
                tag = "eff"
            elif re.search("(?i)previst(a|o)", t):
                real = time.rstrip().lstrip()
                tag = "est"
        assert (prog is not None and real is not None and tag is not None)
        print station
        print prog
        print real
        print tag
        e = (station, prog, real, self.timediff(prog, real), tag)
        ret.append(e)
    return ret
def read_annotation(self):
    files = [f for f in self.annotation_set if isfile(join(self.annotation_path, f))]
    cfile = 1
    nfile = len(files)
    for fi in files:
        # print "Reading trainval " + str(cfile) + "/" + str(nfile)
        cfile += 1
        file_name = self.annotation_path + fi
        f = open(file_name, 'r')
        content = f.read()
        y = BeautifulSoup(str(content))
        img = self.image_path + y.filename.string
        for obj in y.findAll('object'):
            bs_obj = BeautifulSoup(str(obj))
            name = bs_obj.findAll('name')
            if name[0].string == self.cls:
                cls = 1
            else:
                cls = 0
            bndbox = bs_obj.findAll('bndbox')
            (x, y, w, h) = (
                int(bndbox[0].xmax.string),
                int(bndbox[0].xmin.string),
                int(bndbox[0].ymax.string),
                int(bndbox[0].ymin.string)
            )
            data = img, (x, y, w, h), cls
            if cls == 1:
                self.train_set_image_pos.append(data)
            else:
                self.train_set_image_neg.append(data)
def makeHTMLQuestion(fn, htmldata):
    soup = BeautifulSoup(htmldata)
    # add JS
    soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')"
    soup.find('head').insert(0, SUBMIT_JS)
    # replace forms
    forms = soup.findAll('form')
    if forms:
        for form in forms:
            if not form.has_key('method'):
                form['method'] = 'POST'
            if not form.has_key('action'):
                if testmode:
                    form['action'] = 'http://workersandbox.mturk.com/mturk/externalSubmit'
                else:
                    form['action'] = 'http://www.mturk.com/mturk/externalSubmit'
            if not form.has_key('onSubmit'):
                form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');"
            inputtag = Tag(soup, 'input')
            inputtag['type'] = 'hidden'
            inputtag['name'] = 'assignmentId'
            inputtag['id'] = 'myAssignmentId'
            inputtag['value'] = ''
            form.insert(0, inputtag)
    mainurl = uploadfile(fn, str(soup))
    for sub in soup.findAll('img'):
        # TODO
        fn = dirname(fn) + '/' + sub['src']
        uploadfile(fn)
    return ExternalQuestion(escape(mainurl), frame_height)
def getPengelolaSupervisi(strTID):
    try:
        strURL = "http://172.18.65.42/statusatm/viewatmdetail.pl?ATM_NUM=" + strTID
        strHTML = fetchHTML(strURL)
        table = getLargestTable(getTableList(strHTML))

        strHTMLTableRows = getSpecificRows(table, getRowIndex(table, "Replenish By"))
        mysoup = BeautifulSoup(strHTMLTableRows)
        arrTDs = mysoup.findAll('td')
        strPengelola = arrTDs[1].getText()

        strHTMLTableRows = getSpecificRows(table, getRowIndex(table, "KC Supervisi"))
        mysoup = BeautifulSoup(strHTMLTableRows)
        arrTDs = mysoup.findAll('td')
        strSupervisi = arrTDs[1].getText()
    except IndexError:
        strPengelola, strSupervisi = getPengelolaSupervisi(strTID)
    except RuntimeError:
        strPengelola, strSupervisi = "---", "KANWIL BRI JAKARTA III"
    return strPengelola, strSupervisi
def _parse(self, html):
    soup = bs4.BeautifulSoup(html)
    print_link = soup.findAll('a', text='Print')[0].get('href')
    html2 = grab_url(print_link)
    logger.debug('got html 2')
    # Now we have to switch back to bs3. Hilarious.
    # And the labeled encoding is wrong, so force utf-8.
    soup = BeautifulSoup(html2, convertEntities=BeautifulSoup.HTML_ENTITIES, fromEncoding='utf-8')
    self.meta = soup.findAll('meta')
    p_tags = soup.findAll('p')[1:]
    real_p_tags = [p for p in p_tags if not p.findAll(attrs={'class': "twitter-follow-button"})]
    self.title = soup.find('strong').getText()
    entity = soup.find('span', attrs={'class': 'author'})
    children = list(entity.childGenerator())
    try:
        self.byline = 'By ' + children[1].getText()
    except IndexError:
        self.byline = ''
    self.date = children[-1].strip()
    self.body = '\n' + '\n\n'.join([p.getText() for p in real_p_tags])
def getATMProbUKOCRO2(table):
    try:
        # print "getting List of ATMs requires attention..."
        soup = BeautifulSoup(str(table))
        rows = soup.findAll('tr')
        numRows = getRowsNumber(table)
        numRowsHead = getRowsHeadNumber(table)
        numProbUKO = 0
        numProbCRO = 0
        for i in range(numRowsHead, numRows):
            trs = BeautifulSoup(str(rows[i]))
            tdcells = trs.findAll("td")
            if "ATM CENTER" in tdcells[8].getText():
                numProbCRO = numProbCRO + 1
        numProbUKO = numRows - numProbCRO - numRowsHead
        # print "number of CRO problem(s)", numProbCRO, "number of UKO problem(s):", numProbUKO
    except IndexError:
        numProbUKO, numProbCRO = getATMProbUKOCRO(table)
    return int(numProbUKO), int(numProbCRO)
def get(self):
    content = self.request.content
    soup = BeautifulSoup(''.join(content))
    # All text has already been converted to unicode automatically;
    # call encode(xxx) yourself if another encoding is needed.
    title = soup.html.body.h1
    if not title:
        return
    title = title.text
    subtitle = soup.findAll(attrs={'class': 'f_cy f_s16b'})[0].string
    description = soup.find(attrs={'class': 'f_cy f_s14 pt20'})
    description = description.text if description else ''
    smooth_index = soup.findAll(attrs={'class': 'pt20'})[0]
    smooth_index = smooth_index.text if smooth_index else ''
    information = soup.findAll(attrs={'class': 'pt20'})[1]
    information = information.text if information else ''
    tips = soup.find(attrs={'class': 'f_s14 pt20'})
    tips = tips.text + tips.nextSibling.nextSibling.text if tips else ''
    # pics = soup.findAll('a', href=re.compile(r'pic\d'))
    pics = soup.findAll(attrs={'class': 'pic1'})
    if pics:
        imageList = []
        for pic in pics:
            img = pic.find('img')['src']
            imageList.append(img)
            spider.put(HTTP % img)
        self.page.append((self.request.url, title, subtitle, description,
                          smooth_index, information, tips, imageList))
def sanitize_html(value):
    from BeautifulSoup import BeautifulSoup, Comment, Tag

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor emits newer
    # HTML than Vodafone Live will accept. We have to translate 'em' back
    # to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),  # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),   # when loading them as I's the editor leaves them
        ("b", "b"),   # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = [tag for tag, replacement_tag in tags]
    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivalents
    for element, replacement_element in tags:
        if element is not replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    return (
        fragment.replace("&nbsp;", " ")
        .replace("&rsquo;", "'")
        .replace("&lsquo;", "'")
        .replace("&quot;", '"')
        .replace("&ldquo;", '"')
        .replace("&rdquo;", '"')
        .replace("&bull;", "- ")
        .replace("&eacute;", "e")
        .replace("&Eacute;", "E")
        .replace("&ndash;", "-")
    )
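# A hedged usage sketch for sanitize_html above (added for illustration, not
# from the original source): 'em'/'strong' are translated back to 'i'/'b',
# attributes are stripped, and disallowed tags are hidden while their text is
# kept. The input markup is invented, and the exact whitespace of the result
# depends on the tidy library behind tidy_fragment.
dirty = '<p>Hello <em>there</em> <span class="x">friend</span> <strong>bold</strong></p>'
print sanitize_html(dirty)
# Expected shape of the result: <p>Hello <i>there</i> friend <b>bold</b></p>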
def baseurl(html, base):
    if not base.endswith('/'):
        base += '/'
    absurl = re.compile(r'\s*[a-zA-Z][a-zA-Z0-9\+\.\-]*:')  # Starts with scheme:.

    def isabs(url):
        return url.startswith('/') or absurl.match(url)

    soup = BeautifulSoup(html)
    for link in soup.findAll('a', href=True):
        if not isabs(link['href']):
            link['href'] = base + link['href']
    for img in soup.findAll('img', src=True):
        if not isabs(img['src']):
            img['src'] = base + img['src']

    elements = soup.findAll(style=True)  # All styled elements.
    for e in elements:
        def func(m):
            url = m.group(2)
            if not isabs(url):
                url = base + url
            return m.group(1) + url + m.group(3)
        e['style'] = re.sub(r'''(url\(\s*)([^\s\)\"\']*)(\s*\))''', func, e['style'])
        e['style'] = re.sub(r'''(url\(\s*")([^\s\"]*)("\s*\))''', func, e['style'])
        e['style'] = re.sub(r'''(url\(\s*')([^\s\']*)('\s*\))''', func, e['style'])
    return unicode(soup)
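# A minimal usage sketch for baseurl above (added for illustration, not from
# the original source): relative href/src/url() values are prefixed with the
# base, while absolute ones are left alone. The HTML string is invented.
print baseurl('<a href="page.html">x</a><img src="/img/logo.png" />',
              'http://example.com/docs')
# -> <a href="http://example.com/docs/page.html">x</a><img src="/img/logo.png" />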
def content(self, val):
    """Set content"""
    soup = BeautifulSoup(val)

    for todo in soup.findAll('en-todo'):
        todo.name = 'input'
        todo['type'] = 'checkbox'
        if todo.get('checked') == 'false':
            del todo['checked']
        self.changed_by_default = True

    for media in soup.findAll('en-media'):
        if media.get('hash'):  # evernote android app error
            media.name = 'img'
            res = self.parent.resource_edit.get_by_hash(media['hash'])  # shit!
            if res:
                if media['type'].find('image') == 0:
                    media['src'] = 'file://%s' % res.file_path
                else:
                    media['src'] = file_icon_path
                media['title'] = res.file_name
                res.in_content = True
            else:
                media['src'] = ''
                media['title'] = ''
        else:
            media.hidden = True

    # set table id's for identifying on hover
    for num, table in enumerate(soup.findAll('table')):
        table['id'] = 'table_%d' % num
        self._last_table_num = num

    self._content = re.sub(
        r'( | ){5}', '<img class="tab" />',
        unicode(soup).replace(u'\xa0', ' '),
    )  # shit!
    self.apply()
def test_projects_access_restrictions(self):
    # Given an org with two projects
    project1 = Project.objects.create(title='Project 1')
    project1.publish()
    project2 = Project.objects.create(title='Project 2')
    project2.publish()
    Partnership.objects.create(organisation=self.org, project=project1)
    Partnership.objects.create(organisation=self.org, project=project2)
    Employment.objects.create(
        user=self.user3, organisation=self.org, group=self.user_group, is_approved=True
    )

    # When project1 is added to user1's project white list
    white_list = UserProjects.objects.create(user=self.user3, is_restricted=True)
    white_list.projects.add(project1)

    # Then user1's project list should include only project1
    self.c.login(username=self.user3.username, password=self.password)
    url = '/myrsr/projects'
    response = self.c.get(url, follow=True)

    from BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(response.content)

    self.assertEqual(len(soup.findAll('table')), 1)
    # There should be two table rows: one header and one for project1
    self.assertEqual(len(soup.findAll('table')[0].findChildren('tr')), 2)
def extract_label(self, url):
    rsp = self.ua.open(url)
    text = rsp.read()
    soup = BeautifulSoup(text)

    label = soup.findAll('div', {'id': 'gallery-label'})
    label = label[0]
    label = label.text

    tombstone = soup.findAll('div', {'class': 'tombstone'})
    tombstone = tombstone[0]
    sections = tombstone.findAll('div')

    data = {'Chat': label}
    for sect in sections:
        text = sect.text
        stuff = text.split(":")
        k = stuff[0]
        v = stuff[1]
        data[k] = v
    return data
def getATMStats(table):
    soup = BeautifulSoup(str(table))
    rows = soup.findAll('tr')
    numRows = getRowsNumber(table)
    numCols = getColsNumber(table)
    numRowsHead = getRowsHeadNumber(table)
    # print numRowsHead, numRows
    msgBody = ""
    seqNo = 0
    for i in range(0, numRows):
        trs = BeautifulSoup(str(rows[i]))
        tdcells = trs.findAll("td")
        thcells = trs.findAll("th")
        # print len(tdcells), len(thcells)
        if tdcells:
            msgBody += ("\n" + tdcells[0].getText().upper() + ") " + tdcells[1].getText() + ", "
                        + tdcells[2].getText().replace("HYOSUNG", "HYOSUNG ")
                        + "\nLOKASI: " + tdcells[4].getText()
                        + "\nAREA: " + tdcells[5].getText()
                        + "\nDURASI: " + tdcells[6].getText().replace("days", "hari ").replace("hours", "jam")
                        + "\n" + "\n")
    if msgBody == "":
        msgBody = "Tidak ada ATM PROBLEM OPS di wilayah kerja Anda."
    return msgBody
def wiki_links(text, group=None):
    """Replaces CamelCase words with wiki-links."""
    from BeautifulSoup import BeautifulSoup
    autoescape = False
    safe_input = isinstance(text, SafeData)
    conditional_escape(text)
    soup = BeautifulSoup(text)
    for url in soup.findAll(text=wikiword_link):
        if url.parent.name == 'a':
            continue
        new_str = wikiword_link.sub(curry(_re_callback, inside=False, group=group), url)
        url.replaceWith(BeautifulSoup(new_str))
    soup = BeautifulSoup(str(soup))  # Fix for a bug in some versions of BS
    for a in soup.findAll('a'):
        url = a.get('href')
        if not url:
            continue
        new_str = wikiword_link_href.sub(curry(_re_callback, inside=True, group=group), url)
        if new_str != url:
            a['href'] = new_str
    result = force_unicode(soup)
    if safe_input:
        result = mark_safe(result)
    elif autoescape:
        result = escape(result)
    return result
def GET(self):
    i = web.input()
    appid = i.get('appid').rstrip()
    email, url, title = i.get('appdata', '||').split('|')
    userhash = i.get('userhash')
    ts = i.get('ts')
    token = i.get('token')
    query = urllib.urlencode(dict(url=url, title=title))
    if not token:
        raise web.seeother('/share?%s' % query)

    # XXX: security verification etc..
    url = yahooLoginURL(email, '/WSLogin/V1/wspwtoken_login', token)
    try:
        resp = urllib2.urlopen(url)
    except:
        helpers.set_msg('Authorization Failed.')
        raise web.seeother('/share?%s' % query)

    content = resp.read()
    soup = BeautifulSoup(content)
    aurl = 'http://address.yahooapis.com/v1/searchContacts?format=json'
    wssid = soup.findAll('wssid')[0].contents[0]
    cookie = soup.findAll('cookie')[0].contents[0]
    cookie = cookie.strip()

    furl = aurl + '&fields=email,name&email.present=1&appid=%s&WSSID=%s' % (appid, wssid)
    req = urllib2.Request(furl)
    req.add_header('Cookie', cookie)
    req.add_header('Content-Type', 'application/json')
    response = urllib2.urlopen(req).read()
    contacts = self.get_contacts(response)
    save_contacts(email, contacts, provider='YAHOO')
    raise web.seeother('/share?%s' % query)
def parse_site(url, sql_connection):
    sql_cursor = sql_connection.cursor()
    page_html = urllib2.urlopen(url)
    soup = BeautifulSoup(page_html)
    pages = []
    page_nums = []
    for raw_a in soup.findAll('td', {'class': 'page'}):
        page_num_text = raw_a.text
        if page_num_text.encode('utf-8').strip() == u'···'.encode('utf-8').strip():
            pass
        else:
            page_num = int(page_num_text)
            if page_nums and (page_num - page_nums[-1]) > 1:
                for i in xrange(page_nums[-1], page_num + 1):
                    pages.append(url + 'index' + str(i) + ".html")
            page_nums.append(page_num)
            pages.append(PORTAL_NAME + str(raw_a.a['href']))
    pages = unique(pages)
    pages.append(url)
    for pg in pages:
        print pg
        ps = BeautifulSoup(urllib2.urlopen(pg))
        for item in ps.findAll('a', {'class': 'itemname'}):
            try:
                print item.contents[0].strip()
                print item.span.string
                sql_cursor.execute("INSERT INTO parsed(site, program, version) VALUES(?, ?, ?)",
                                   [pg, item.contents[0].strip(), item.span.string])
            except AttributeError as e:
                sql_connection.rollback()
                continue
            else:
                sql_connection.commit()
def html_analysis_average(html_path): ouput_filename = html_path + "-links.txt" f = open(ouput_filename, 'w') count_file_number = 0 count_externaljavascript = 0 count_cssexternallink = 0 count_login_number = 0 contain_login_number = 0 tofinda = 'www' tofindb = 'com' tofindc = 'log' tofindd = 'sign' tofinde = 'register' for filename in os.listdir(html_path): count_page_valid = 0 count_externallink = 0 count_internallink = 0 count_file_number = count_file_number + 1 print filename try: soup = BeautifulSoup(file(html_path + filename)) for link in soup.findAll('a'): if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'href') and tofindb not in link.get('href'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'src') or tofindb not in link.get('src'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('img'): if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'src') or tofindb not in link.get('src'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('img'): if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'href') or tofindb not in link.get('href'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('div'): if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'href') and tofindb not in link.get('href'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_externallink = count_externallink + 1 count_page_valid = count_page_valid + 1 if tofinda not in link.get( 'src') or tofindb not in link.get('src'): count_internallink = count_internallink + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('script'): if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_externaljavascript = count_externaljavascript + 1 count_page_valid = count_page_valid + 1 if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_externaljavascript = count_externaljavascript + 1 count_page_valid = count_page_valid + 1 for link in soup.findAll('link'): if (link.get('href') != None): if tofinda in link.get('href') or tofindb in link.get( 'href'): count_cssexternallink = count_cssexternallink + 1 count_page_valid = count_page_valid + 1 if (link.get('src') != None): if tofinda in link.get('src') or tofindb in link.get( 'src'): count_cssexternallink = count_cssexternallink + 1 count_page_valid = count_page_valid + 1 contain_login = False for link in soup.findAll('form'): if 
(link.get('name') != None): if tofindc in link.get('name') or tofindd in link.get( 'name') or tofinde in link.get('name'): count_login_number = count_login_number + 1 count_page_valid = count_page_valid + 1 contain_login = True elif (link.get('action') != None): if tofindc in link.get('action') or tofindd in link.get( 'action') or tofinde in link.get('action'): count_login_number = count_login_number + 1 count_page_valid = count_page_valid + 1 contain_login = True for link in soup.findAll('div'): if (link.get('name') != None): if tofindc in link.get('name') or tofindd in link.get( 'name') or tofinde in link.get('name'): count_login_number = count_login_number + 1 count_page_valid = count_page_valid + 1 contain_login = True elif (link.get('action') != None): if tofindc in link.get('action') or tofindd in link.get( 'action') or tofinde in link.get('action'): count_login_number = count_login_number + 1 count_page_valid = count_page_valid + 1 contain_login = True if contain_login == True: contain_login_number = contain_login_number + 1 if count_page_valid != 0: f.write(str(count_externallink - count_internallink) + ", ") except: pass f.close()
def parseSourceCode(soup,rankIndexValue, name , reviewerID , TotalReviews , HelpfulVotes , crNumPercentHelpful , crNumFanVoters): soupParsed = BeautifulSoup(soup) # the second table is for the review content datapart= soupParsed.findAll("table",{"class":"small"}) # review part is marked by "<div style="margin-left:0.5em;">" reviewParts= soupParsed.findAll("div",{"style":"margin-left:0.5em;"}) # for the review section of each product, there are two table with class value of small # so table 1, 4, 7, 10 is for the product link # in the product section, if the product is a vine free product, there is no price tag. productInfo=[] for productIndex in range(0,10): try: #productIndex=1 productPart = datapart[productIndex*3+1] productName= productPart.text price = productName[productName.find("Price:")+6:] price.replace("$","") productName = productName[:productName.find("Price:")] #print len(datapart) #print "----------------------------------------------" #print productPart productLink = productPart.find("a")['href'] #print productLink #productInfo.append([productName,productLink]) #reviewPartsIndex=0 reviewPart= reviewParts[productIndex] starReview=reviewPart.find("img")["title"] #print starReview reviewTime = reviewPart.find("nobr").text isVerifiedPurchase="Not Verified" CountVerifiedPurchase = reviewPart.findAll("span",{"class":"crVerifiedStripe"}) if CountVerifiedPurchase: #isVerifiedPurchase = reviewPart.find("span",{"class":"crVerifiedStripe"}).text isVerifiedPurchase ="Verified Purchase" print 'isVerifiedPurchase:',isVerifiedPurchase,'\n' #print reviewTime AllReviewText = reviewParts[productIndex].text if "Vine Customer Review of Free Product" in AllReviewText: IsVineReviewFreeProduct="YesVineReviewFreeProduct" else: IsVineReviewFreeProduct="NoVineReviewFreeProduct" #print IsVineReviewFreeProduct reviewText=reviewPart.find("div",{"class":"reviewText"}).text print "productName,",productName,"\n" print "productLink:\n", productLink,"\n" print "starReview:\n",starReview,"\n" print "reviewTime:\n",reviewTime,"\n" print "IsVineReviewFreeProduct:\n",IsVineReviewFreeProduct,"\n" print reviewText print "------------------------------------------------------------" productInfo.append([rankIndexValue, name , reviewerID , TotalReviews , HelpfulVotes , crNumPercentHelpful , crNumFanVoters,productName,price,productLink,starReview,reviewTime,isVerifiedPurchase,IsVineReviewFreeProduct,reviewText]) except: print 'failed this product section' #productReviewInfo=zip(productInfo,reviewInfo) #break return productInfo
return None def Load_Video( url ): print "Load_Video=" + url try: response = urllib2.urlopen(url) html = response.read() except urllib2.HTTPError, e: html = e.fp.read() pass soup = BeautifulSoup( html ) sourceVideos = [] # Handle href tags for a in soup.findAll('a', href=True): if a['href'].find("youtu.be") != -1: sourceVideos.append( a['href'].split()[0] ) if a['href'].find("youtube") != -1: sourceVideos.append( a['href'].split()[0] ) if a['href'].find("dailymotion") != -1: sourceVideos.append( a['href'].split()[0] ) if a['href'].find("tamildbox") != -1: src = a['href'].split()[0] print "tamildbox", src resp = net.http_GET( src ) dbox = resp.content sourceVideos += re.compile( '<iframe(.+?)>').findall( dbox )
#lists for mangas. urlChapters = [] # list of urls. urlChapter = [] # url of current chapter. urlImage = [] # list of images. request = urllib2.Request(target_url+'/'+ target_info, headers = headers) response = urllib2.urlopen(request) doc = response.read() #print doc ### prints the source of the page. soupDoc = BeautifulSoup(doc) # soup to parse the html. #soup = soup.prettify() # resource from http://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautiful-soup #'a' for all the items, then href for the links. links = soupDoc.findAll('a') #making a list of all Links on manga. #print links for link in links: #print link urlChapters.append(target_url+link['href']) #href - relates the links (look at page source) def getChapter(url): ''' A method to get all the usable links to later parse only the images. ''' print url chapterurl = url #the url of the chapter. reqChapter = urllib2.Request(chapterurl,headers = headers)#requesting. responseChapter = urllib2.urlopen(reqChapter) #returns the request.
def random_quote(jenni, cat): if cat is not None: if cat not in CATS: jenni.say("I don't know that category, please select from one of: {0}".format(', '.join(CATS))) return else: cat = CATS[randrange(len(CATS))] page_title = page_id = None # First drill down to the lowest category while(True): try: cat_url = BASE_URL + SUBCATS % cat content = json.loads(urllib.urlopen(cat_url).read()) cat_members = content["query"]["categorymembers"] # Select at random random_member = choose_random_member(cat_members) if random_member is None: jenni.say("An error occurred fetching a subcategory") return if random_member["type"] == "subcat": cat = random_member["title"] else: page_title = random_member["title"] page_id = random_member["pageid"] break except Exception as e: jenni.say("An error occurred fetching a quote: {0}".format(e)) return # Next select a random quote from the page try: page_url = BASE_URL + SECTIONS % page_id content = json.loads(urllib.urlopen(page_url).read()) sections = content["parse"]["sections"] quote = None num_tries = 0 while quote == None and num_tries < MAX_TRIES: section = choose_random_section(sections) if section is None: jenni.say("We accidentally chose a page with no quotes, sorry about that!") return section_index = randrange(len(sections)) + 1 section_url = BASE_URL + SECTION % (page_id, section_index) content = json.loads(urllib.urlopen(section_url).read()) section_title = content["parse"]["title"] html = Soup(content["parse"]["text"]["*"]) all_quotes = [] for ul in html.findAll('ul'): for li in ul.findAll('li'): all_quotes.append(li.text) for dd in html.findAll('dd'): all_quotes.append(dd.text.replace("<b>","").replace("</b>","")) len_all_quotes = len(all_quotes) if len_all_quotes == 0: num_tries += 1 else: quote = all_quotes[randrange(len_all_quotes)] if quote is None: jenni.say("We accidentally chose a section of a page with no quotes, sorry about that!") return jenni.say("{0}: {1}".format(section_title, quote.encode('utf-8'))) except Exception as e: jenni.say("An error occurred fetching a quote: {0}".format(e)) return
threadurl = 'http://www.okcupid.com/messages?readmsg=true&threadid=' + threadid + '&folder=2' threadreq = urllib2.build_opener() threadreq.addheaders.append(OkCupid.cookietuple) returnedthreadpage = threadreq.open(threadurl) threadsoup = BeautifulSoup(returnedthreadpage.read()) threadmsgs = threadsoup.findAll('div', {'class': 'message'}) username = threadsoup.find('a', { 'class': 'buddyname' }).text.encode("utf-8") receiverid = threadsoup.find( 'input', {'name': 'receiverid'})['value'].encode("utf-8") if len(threadmsgs) == 2: deletethread(threadid, username, receiverid) if __name__ == '__main__': pagecount = 1 while True: request = urllib2.build_opener() request.addheaders.append(OkCupid.cookietuple) sentmessagespage = request.open( 'http://www.okcupid.com/messages?low=' + str(pagecount) + '&folder=2') soup = BeautifulSoup(sentmessagespage.read()) msghtml = soup.findAll('li', {'class': re.compile('^readMessage')}) if len(msghtml) == 0: break for thread in msghtml: checkthread(thread['id'].encode('utf-8').replace('message_', '')) pagecount += 30
#just checking my urls - Accidentally created infinite loop. funtimes. #print baseurl3 # grab lyrics lyrics = soup2.findAll("div",{"class":"body_lyr"}) # No lyrics? stop program! if(len(lyrics) == 0): exit; # for each article... for entry in lyrics: mc2 = str(entry) # find the name of the product mosoup = BeautifulSoup(mc2) #remove comments & all other stuff comments = mosoup.findAll(text=lambda text:isinstance(text, Comment)) [comment.extract() for comment in comments] for script in mosoup("script"): mosoup.script.extract() for style in mosoup("style"): mosoup.style.extract() for iframe in mosoup("iframe"): mosoup.iframe.extract() for h4 in mosoup("h4"): mosoup.h4.extract() for h5 in mosoup("h5"): mosoup.h5.extract() for h2 in mosoup("h2"): mosoup.h2.extract() for a in mosoup("a"):
soup = BeautifulSoup(response)
print soup.html.head.title.string


class Person:
    def __init__(self, vorname, nachname, email, city):
        self.vorname = vorname
        self.nachname = nachname
        self.email = email
        self.city = city


persons = []

for link in soup.findAll("a"):
    if link.string == "See full profile":
        person_url = "https://scrapebook22.appspot.com" + link["href"]
        person_html = urlopen(person_url).read()
        person_soup = BeautifulSoup(person_html)

        name = person_soup.find("div", attrs={"class": "col-md-8"}).h1.string
        email = person_soup.find("span", attrs={"class": "email"}).string
        city = person_soup.find("span", attrs={"data-city": True}).string

        vorname, nachname = name.split(" ")
        person = Person(vorname=vorname, nachname=nachname, email=email, city=city)
from BeautifulSoup import BeautifulSoup
from urllib2 import urlopen

url = 'https://scrapebook22.appspot.com/'
response = urlopen(url).read()
soup = BeautifulSoup(response)

csv_file = open("email_list.csv", "w")

for link in soup.findAll("a"):
    if link.string == "See full profile":
        link_url = "https://scrapebook22.appspot.com/" + link["href"]
        link_html = urlopen(link_url).read()
        link_soup = BeautifulSoup(link_html)

        name = link_soup.findAll("h1")[1].string
        email = link_soup.find("span", attrs={"class": "email"}).string
        city = link_soup.find("span", attrs={"data-city": True}).string

        csv_file.write(name + "," + email + "," + city + "\n")
def downloadFirstAlbumImage(albumUrl, localFileName):
    albumSource = requests.get(albumUrl).text
    soup = BeautifulSoup(albumSource)
    matches = soup.findAll('link', rel='image_src')
    downloadImage(matches[0]['href'], localFileName)
def HTML_TO_TEXT(html):
    soup = BeautifulSoup(html)
    text_parts = soup.findAll(text=True)
    text = ''.join(text_parts)
    return text
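# A quick usage sketch for HTML_TO_TEXT above (added for illustration, not from
# the original source): it simply concatenates every text node BeautifulSoup 3
# finds, dropping the tags. The markup string is invented.
from BeautifulSoup import BeautifulSoup  # assumed available, as in the surrounding snippets

print HTML_TO_TEXT("<p>Hello <b>world</b>!</p>")  # -> Hello world!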
import re, os

report_date = date(2008, 12, 31)
crawl_date = date(2009, 3, 17)
nonnumeric = re.compile('[^0-9]')

for institution in Institution.objects.exclude(ots_number=''):
    if institution.institutionassethistory_set.all().count() == 0 and institution.transaction_set.all().count() > 0:
        url = 'http://www.ots.treas.gov/?p=InstitutionSearch&hid=%s' % (institution.ots_number)
        data = urllib.urlopen(url)
        soup = BeautifulSoup(data.read())
        print institution.ots_number
        institutionProfile = soup.findAll("table", attrs={"class": "institutionProfile"})
        if len(institutionProfile) > 0:
            assets = nonnumeric.sub('', institutionProfile[0].findAll("td")[4].string.strip())
            print assets
            InstitutionAssetHistory.objects.create(institution=institution, report_date=report_date,
                                                   crawl_date=crawl_date, total_deposits=None,
                                                   total_assets=assets)
import urllib2
import re
from BeautifulSoup import BeautifulSoup
from htmltreediff import diff
from kitchen.text.converters import to_unicode

f1 = open('travel.html', 'rw')
v1 = f1.read()

# def scrape(url=None, keywords=[], frequency=None, email=None):

# User inputs URL
# page = urllib2.urlopen("http://labor.ny.gov/app/warn/")
page = urllib2.urlopen("http://travel.state.gov/travel/cis_pa_tw/tw/tw_1764.html")

# Scrape URL, all of it
soup = BeautifulSoup(page)

# Find keywords
# warns_nyc = soup.findAll(text=re.compile("New York City"))
warns_travel = soup.findAll(text=re.compile("Lebanon"))

print diff(to_unicode(v1), to_unicode(page), pretty=True)

f1.close()
print ii doc = db[id] if doc.has_key('type') and doc['type'] == 'decision' and doc.has_key( '_attachments'): atts = doc['_attachments'] hts = atts.keys() at = db.get_attachment(id, hts[0]) s = at.read() soup = BeautifulSoup(s) court_found = False # Search first in the "court" class courts_html = br.sub( ' ', str(" ".join([str(ss) for ss in soup.findAll("p", "court")]))) # Standard court = cir.findall(courts_html) if len(court) > 0: court_found = True court_names[court[0].lower()] = 1 # Customs & Patent if not court_found: court = cus.findall(courts_html) if len(court) > 0: court_found = True court_names[court[0].lower()] = 1 # Claims
def find_entries(): page = 1 total_entry_count = 0 entries = [] while True: print " ---> Found %s entries so far. Now on page: %s" % (len(entries), page) knight_url = "http://newschallenge.tumblr.com/page/%s" % (page) html = requests.get(knight_url).content soup = BeautifulSoup(html) postboxes = soup.findAll("div", "postbox") # Done if only sticky entry is left. if len(postboxes) <= 1: break page += 1 # 15 entries per page, plus a sticky throwaway entry for entry in postboxes: if 'stickyPost' in entry.get('class'): continue total_entry_count += 1 likes = entry.find("", "home-likes") if likes and likes.text: likes = int(likes.text) else: likes = 0 comments = entry.find("", "home-comments") if comments and comments.text: comments = int(comments.text) else: comments = 0 title = entry.find("h2") if title: title = title.text url = entry.find('a', "home-view") if url: url = url.get('href') # Only record active entries if comments or likes: entries.append({ 'likes': likes, 'comments': comments, 'title': title, 'url': url, }) # time.sleep(random.randint(0, 2)) entries.sort(key=lambda e: e['comments'] + e['likes']) entries.reverse() active_entry_count = len(entries) found_entries = [] winner_count = 0 for i, entry in enumerate(entries): is_winner = entry['url'] in winners if is_winner: winner_count += 1 print " * %s#%s: %s likes - [%s](%s)%s" % ( "**" if is_winner else "", i + 1, entry['likes'], entry['title'], entry['url'], "**" if is_winner else "") found_entries.append(entry) print " ***> Found %s active entries among %s total applications with %s/%s winners." % ( active_entry_count, total_entry_count, winner_count, len(winners)) return found_entries
#print headers # get all test cases resp, content = html.request(TEST_CASE_VO_URL, "GET", headers=headers) #pp(content) tcs = content.split('Data/Test/') for tc in tcs: bs = BeautifulSoup(tc) category = bs.find(name='attribute',attrs={'name':'Category.Name'}) if category: # Only pick Automated test cases if category.text == 'Automated': tcid = tc.split('" id=')[0] att_tokens = bs.findAll('attribute') attrs_dict = {} for att_token in att_tokens: attrs_dict[att_token['name']] = \ remove_html_tags(html_decoder(att_token.text)) print '{name}: "{text}"'.format(name=att_token['name'], text=attrs_dict[att_token['name']]) # Execute test steps steps = attrs_dict['Steps'].split('\n') tc_passed = True tc_actual_output = '' for step in steps: if step: print step status, output = commands.getstatusoutput(step)
'mediaset extra':'http://www.staseraintv.com/programmi_stasera_mediaset_extra.html', 'sportitalia':'http://www.staseraintv.com/programmi_stasera_sportitalia.html', 'cielo':'http://www.staseraintv.com/programmi_stasera_cielo.html', 'italia 1':'http://www.staseraintv.com/programmi_stasera_italia1.html', 'italia 2':'http://www.staseraintv.com/programmi_stasera_italia2.html', 'mtv':'http://www.staseraintv.com/programmi_stasera_mtv.html', 'la 7':'http://www.staseraintv.com/programmi_stasera_la7.html', 'la 7D':'http://www.staseraintv.com/programmi_stasera_la7d.html' } day = datetime.now(timezone('Europe/Rome')).strftime('%Y-%m-%d') sourcedata = 'italian_tv_schedules' for tv in television.keys(): html = scraperwiki.scrape(television[tv]) soup = BeautifulSoup(html) tds = soup.findAll('table')[4].findAll('tr')[0].findAll('td') schedule_table = tds[len(tds)-1].findAll('small') schedule = schedule_table[len(schedule_table)-1] for s in schedule: v = str(s).replace('\r\n','') if (v.find(' - ') > 1): info = v.split(' - ') if (len(info) > 2): what = "" for i in range(len(info)): if (i > 1): what += " - " + info[i] else: if (i == 1): what += info[i] else:
month['04'] = 'Apr' month['05'] = 'May' month['06'] = 'Jun' month['07'] = 'Jul' month['08'] = 'Aug' month['09'] = 'Sep' month['10'] = 'Oct' month['11'] = 'Nov' month['12'] = 'Dec' for j in range(len(html)): out = urlopen(html[j]).read() soup = BeautifulSoup(out) res = soup.findAll('a') fr= [] date=[] for k in range(len(res)): if res[k].has_key('href'): ab = res[k]['href'] ab = ab.strip('..') ba = re.findall('\d\d\d\d\d\d/\d\d\d\d\d\d[a-z]\.html|\d\d\d\d\d\d/\d\d\d\d\d\d\.html', str(ab)) if len(ba)>0 : fr.append(ab.encode('UTF-8')) att = ab emp = re.findall('\d\d\d\d\d\d[a-z]\.html|\d\d\d\d\d\d\.html', str(att)) almost = re.sub('\W', '', emp[0].strip('.html')) almost = almost.strip('a').strip('b').strip('c').strip('d').strip('e').strip('f') mons = month[almost[0:2]] day = almost[2:4]
def fetch_youtube(self, address): username = None channel_id = None list_id = None if 'gdata.youtube.com' in address: try: username_groups = re.search('gdata.youtube.com/feeds/\w+/users/(\w+)/', address) if not username_groups: return username = username_groups.group(1) except IndexError: return elif 'youtube.com/feeds/videos.xml?user='******'user'][0] except IndexError: return elif 'youtube.com/feeds/videos.xml?channel_id=' in address: try: channel_id = urlparse.parse_qs(urlparse.urlparse(address).query)['channel_id'][0] except (IndexError, KeyError): return elif 'youtube.com/playlist' in address: try: list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['list'][0] except IndexError: return elif 'youtube.com/feeds/videos.xml?playlist_id' in address: try: list_id = urlparse.parse_qs(urlparse.urlparse(address).query)['playlist_id'][0] except IndexError: return if channel_id: video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?channel_id=%s" % channel_id) channel_json = requests.get("https://www.googleapis.com/youtube/v3/channels?part=snippet&id=%s&key=%s" % (channel_id, settings.YOUTUBE_API_KEY)) channel = json.decode(channel_json.content) try: username = channel['items'][0]['snippet']['title'] description = channel['items'][0]['snippet']['description'] except (IndexError, KeyError): return elif list_id: playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlists?part=snippet&id=%s&key=%s" % (list_id, settings.YOUTUBE_API_KEY)) playlist = json.decode(playlist_json.content) try: username = playlist['items'][0]['snippet']['title'] description = playlist['items'][0]['snippet']['description'] except (IndexError, KeyError): return channel_url = "https://www.youtube.com/playlist?list=%s" % list_id elif username: video_ids_xml = requests.get("https://www.youtube.com/feeds/videos.xml?user=%s" % username) description = "YouTube videos uploaded by %s" % username else: return if list_id: playlist_json = requests.get("https://www.googleapis.com/youtube/v3/playlistItems?part=snippet&playlistId=%s&key=%s" % (list_id, settings.YOUTUBE_API_KEY)) playlist = json.decode(playlist_json.content) try: video_ids = [video['snippet']['resourceId']['videoId'] for video in playlist['items']] except (IndexError, KeyError): return else: if video_ids_xml.status_code != 200: return video_ids_soup = BeautifulSoup(video_ids_xml.content) channel_url = video_ids_soup.find('author').find('uri').getText() video_ids = [] for video_id in video_ids_soup.findAll('yt:videoid'): video_ids.append(video_id.getText()) videos_json = requests.get("https://www.googleapis.com/youtube/v3/videos?part=contentDetails%%2Csnippet&id=%s&key=%s" % (','.join(video_ids), settings.YOUTUBE_API_KEY)) videos = json.decode(videos_json.content) if 'error' in videos: logging.debug(" ***> ~FRYoutube returned an error: ~FM~SB%s" % (videos)) return data = {} data['title'] = ("%s's YouTube Videos" % username if 'Uploads' not in username else username) data['link'] = channel_url data['description'] = description data['lastBuildDate'] = datetime.datetime.utcnow() data['generator'] = 'NewsBlur YouTube API v3 Decrapifier - %s' % settings.NEWSBLUR_URL data['docs'] = None data['feed_url'] = address rss = feedgenerator.Atom1Feed(**data) for video in videos['items']: thumbnail = video['snippet']['thumbnails'].get('maxres') if not thumbnail: thumbnail = video['snippet']['thumbnails'].get('high') if not thumbnail: thumbnail = video['snippet']['thumbnails'].get('medium') duration_sec = 
isodate.parse_duration(video['contentDetails']['duration']).seconds if duration_sec >= 3600: hours = (duration_sec / 3600) minutes = (duration_sec - (hours*3600)) / 60 seconds = duration_sec - (hours*3600) - (minutes*60) duration = "%s:%s:%s" % (hours, '{0:02d}'.format(minutes), '{0:02d}'.format(seconds)) else: minutes = duration_sec / 60 seconds = duration_sec - (minutes*60) duration = "%s:%s" % ('{0:02d}'.format(minutes), '{0:02d}'.format(seconds)) content = """<div class="NB-youtube-player"><iframe allowfullscreen="true" src="%s?iv_load_policy=3"></iframe></div> <div class="NB-youtube-stats"><small> <b>From:</b> <a href="%s">%s</a><br /> <b>Duration:</b> %s<br /> </small></div><hr> <div class="NB-youtube-description">%s</div> <img src="%s" style="display:none" />""" % ( ("https://www.youtube.com/embed/" + video['id']), channel_url, username, duration, linkify(linebreaks(video['snippet']['description'])), thumbnail['url'] if thumbnail else "", ) link = "http://www.youtube.com/watch?v=%s" % video['id'] story_data = { 'title': video['snippet']['title'], 'link': link, 'description': content, 'author_name': username, 'categories': [], 'unique_id': "tag:youtube.com,2008:video:%s" % video['id'], 'pubdate': dateutil.parser.parse(video['snippet']['publishedAt']), } rss.add_item(**story_data) return rss.writeString('utf-8')
#print Fore.YELLOW + dd, Fore.WHITE #c[1].show() home = os.path.expanduser("~") local = home + "/Downloads/TEMP/"+filename c[1].write_image(local, format="svg", width=1024, height = 768) comment = "[[User_talk:Fæ/DrugStats|DrugStats]] chart for {} {}".format(title, date) pywikibot.setAction(comment) if len(sys.argv)<2: up(local, filename, dd, comment, True) remove(local) return True topurl = "https://clincalc.com/DrugStats/Top300Drugs.aspx" html = urllib2.urlopen(topurl).read() soup = BeautifulSoup(html) drugs = soup.findAll('a', href=re.compile('Drugs/.*')) print Fore.GREEN + NOTICE print '*'*80 print Fore.CYAN, soup.find('meta', {'name':'description'})['content'] print Fore.CYAN, "Drugs found", len(drugs), Fore.WHITE count = 0 '''for drug in drugs: count += 1 upthisdrug(drug)''' chars="abcdefghijklmnopqrstuvwxyz";charx=[];charxx=[] for i in range(0, len(chars)): charx.append(chars[i]) for a in charx: for b in charx: for c in charx:
def get_simbad_literature_refs(self, srcid_class_match_dict): """ Query and retrieve SIMBAD literature references for Simbad matched ASAS sources. """ import urllib from BeautifulSoup import BeautifulSoup, Comment litrefs_init_fpath = '/home/dstarr/scratch/determine_simbad__orig/src_litrefs.pkl' if os.path.exists(litrefs_init_fpath): fp = open(litrefs_init_fpath, 'rb') src_litrefs = cPickle.load(fp) fp.close() else: src_litrefs = {} srcid_list = srcid_class_match_dict.keys() srcid_list.sort() n_srcid = len(srcid_list) for i, src_id in enumerate(srcid_list): src_dict = srcid_class_match_dict[src_id] if src_dict['main_id'].count(' ') > 0: ### Source names return random literature results if ' ' is not replaced with '+' src_litrefs[src_id] = {} else: ### This assumes that src_litrefs[src_id] has been filled previously continue # we skip re-retrieving this source. src_name = src_dict['main_id'].replace(' ', '+') #url_str_new = "http://simbad.u-strasbg.fr/simbad/sim-id?Ident=%s&NbIdent=1&Radius=2&Radius.unit=arcmin&submit=submit+id" % (src_name) url_str = "http://adsabs.harvard.edu/cgi-bin/nph-abs_connect?db_key=AST&db_key=PRE&qform=AST&arxiv_sel=astro-ph&arxiv_sel=cond-mat&arxiv_sel=cs&arxiv_sel=gr-qc&arxiv_sel=hep-ex&arxiv_sel=hep-lat&arxiv_sel=hep-ph&arxiv_sel=hep-th&arxiv_sel=math&arxiv_sel=math-ph&arxiv_sel=nlin&arxiv_sel=nucl-ex&arxiv_sel=nucl-th&arxiv_sel=physics&arxiv_sel=quant-ph&arxiv_sel=q-bio&sim_query=YES&ned_query=YES&adsobj_query=YES&obj_req=YES&aut_logic=OR&obj_logic=OR&author=&object=%s&start_mon=&start_year=&end_mon=&end_year=&ttl_logic=OR&title=&txt_logic=OR&text=&nr_to_return=200&start_nr=1&jou_pick=ALL&ref_stems=&data_and=ALL&group_and=ALL&start_entry_day=&start_entry_mon=&start_entry_year=&end_entry_day=&end_entry_mon=&end_entry_year=&min_score=&sort=SCORE&data_type=SHORT&aut_syn=YES&ttl_syn=YES&txt_syn=YES&aut_wt=1.0&obj_wt=1.0&ttl_wt=0.3&txt_wt=3.0&aut_wgt=YES&obj_wgt=YES&ttl_wgt=YES&txt_wgt=YES&ttl_sco=YES&txt_sco=YES&version=1" % ( src_name) #(src_dict['main_id']) f_url = urllib.urlopen(url_str) webpage_str = f_url.read() f_url.close() soup = BeautifulSoup(webpage_str) comments = soup.findAll( text=lambda text: isinstance(text, Comment)) [comment.extract() for comment in comments] #print soup.html.body.form.find('table') #print '------------' #print soup.html.body.form.findAll('table')[1].table.tbody.findAll('tr') #soup.html.body.form.findAll('table')[1].extract() #bib_rows = soup.html.body.form.fetch('table')[1].fetch('tr') #print soup try: bib_rows = soup.html.body.form('table', limit=2)[1]('tr') print 'parsed:', i, n_srcid, src_id, src_dict['main_id'] except: # likely no results returned #print 'len(soup.html.body.form.table):', len(soup.html.body.form.table) print 'skip: ', i, n_srcid, src_id, src_dict['main_id'] continue for r in bib_rows: for td in r('td'): x = td.input if x == None: continue bibcode = x['value'] abstract_url = td.a['href'] # NOTE: I could probably extract some author names, title src_litrefs[src_id][bibcode] = abstract_url #print bibcode, abstract_url #import pdb; pdb.set_trace() #print #fp = open('/tmp/124', 'w') #fp.write(webpage_str) #fp.close() #import pdb; pdb.set_trace() #print #elemtree = ElementTree.fromstring(webpage_str) #xmld_data = xmldict.ConvertXmlToDict(elemtree) #b = xmld_data['HTML']['body']['form'] if (i % 500) == 0: fp = open('/tmp/src_litrefs_%d.pkl' % (i), 'wb') cPickle.dump(src_litrefs, fp, 1) # ,1) means a binary pkl is used. 
fp.close() import pdb pdb.set_trace() print fp = open('/tmp/src_litrefs.pkl', 'wb') cPickle.dump(src_litrefs, fp, 1) # ,1) means a binary pkl is used. fp.close()
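# The Comment-stripping idiom used above (and again in get_simbad_abstracts
# below) can be pulled out into a small helper. A minimal sketch, assuming
# only BeautifulSoup 3:
from BeautifulSoup import BeautifulSoup, Comment

def strip_html_comments(html):
    """Parse html and return a soup with all <!-- ... --> comment nodes removed."""
    soup = BeautifulSoup(html)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup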
from BeautifulSoup import BeautifulSoup from urllib2 import urlopen import random url = "https://quotes.yourdictionary.com/theme/marriage/" #the trailing slash is required response = urlopen(url).read() soup = BeautifulSoup(response) quotes = [] for quote in soup.findAll("p", attrs={"class": "quoteContent"}): #find the quotes with class quoteContent quotes.append(quote.string) # append the quote strings to the quotes list #print len(quotes) available_quotes = [] while len(available_quotes) < 5: # only 5 random quotes should be printed random_quote_index = random.randint(0, len(quotes) - 1) # random_quote_index is an int; 29 quotes, counting starts at 0 so the max index is range - 1 random_quote = quotes[random_quote_index] # new variable = the quote at a random position in the quotes list if random_quote_index not in available_quotes: # how to avoid duplicating the same items from the list! available_quotes.append(random_quote_index) # append only distinct values print "- " + random_quote
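# The while-loop above avoids duplicates by tracking indices; random.sample
# gives the same "up to 5 distinct quotes" effect in one call. A minimal
# sketch, assuming the quotes list built above:
import random

def pick_quotes(quotes, n=5):
    """Return up to n distinct quotes, in random order."""
    return random.sample(quotes, min(n, len(quotes)))

# for q in pick_quotes(quotes):
#     print "- " + q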
class ScholarParser(): """ ScholarParser can parse HTML document strings obtained from Google Scholar. It invokes the handle_article() callback on each article that was parsed successfully. """ SCHOLAR_SITE = 'http://scholar.google.com' def __init__(self, site=None): self.soup = None self.article = None self.site = site or self.SCHOLAR_SITE def handle_article(self, art): """ In this base class, the callback does nothing. """ def parse(self, html): """ This method initiates parsing of HTML content. """ self.soup = BeautifulSoup(html) for div in self.soup.findAll(ScholarParser._tag_checker): self._parse_article(div) def _parse_article(self, div): self.article = Article() for tag in div: if not hasattr(tag, 'name'): continue if tag.name == 'div' and tag.get('class') == 'gs_rt' and \ tag.h3 and tag.h3.a: self.article['title'] = ''.join(tag.h3.a.findAll(text=True)) self.article['url'] = self._path2url(tag.h3.a['href']) if tag.name == 'font': for tag2 in tag: if not hasattr(tag2, 'name'): continue if tag2.name == 'span' and tag2.get('class') == 'gs_fl': self._parse_links(tag2) if self.article['title']: self.handle_article(self.article) def _parse_links(self, span): for tag in span: if not hasattr(tag, 'name'): continue if tag.name != 'a' or tag.get('href') == None: continue if tag.get('href').startswith('/scholar?cites'): if hasattr(tag, 'string') and tag.string.startswith('Cited by'): self.article['num_citations'] = \ self._as_int(tag.string.split()[-1]) self.article['url_citations'] = self._path2url(tag.get('href')) if tag.get('href').startswith('/scholar?cluster'): if hasattr(tag, 'string') and tag.string.startswith('All '): self.article['num_versions'] = \ self._as_int(tag.string.split()[1]) self.article['url_versions'] = self._path2url(tag.get('href')) @staticmethod def _tag_checker(tag): if tag.name == 'div' and tag.get('class') == 'gs_r': return True return False def _as_int(self, obj): try: return int(obj) except ValueError: return None def _path2url(self, path): if path.startswith('http://'): return path if not path.startswith('/'): path = '/' + path return self.site + path
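# ScholarParser.handle_article() is a deliberate no-op hook; a minimal usage
# sketch, assuming an html string fetched from Google Scholar elsewhere and
# the Article objects the parser builds:
class PrintingScholarParser(ScholarParser):
    def handle_article(self, art):
        # called once per successfully parsed search result
        print art['title'], art['url']

# parser = PrintingScholarParser()
# parser.parse(html)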
# Blank Python school_list = ['alabama-a--and--m-university','university-of-alabama----birmingham','university-of-alabama----huntsville','auburn-university','university-of-north-alabama','samford-university','University-of-South-Alabama','columbia-southern-university','alabama-state-university','auburn-university----montgomery','jacksonville-state-university','university-of-montevallo','oakwood-university','university-of-alabama','birmingham--southern-college','troy-university','tuskegee-university','university-of-alaska-anchorage','university-of-alaska-fairbanks','grand-canyon-university','university-of-advancing-technology','arizona-state-university','university-of-arizona','embry--riddle-aeronautical-university----prescott','northern-arizona-university','university-of-phoenix----phoenix--hohokam','university-of-phoenix----southern-arizona','university-of-arkansas-at-little-rock','university-of-arkansas','harding-university','arkansas-state-university','philander-smith-college','arkansas-tech-university','hendrix-college','university-of-the-ozarks','california-baptist-university','california-institute-of-technology','california-lutheran-university','california-state-university----stanislaus','california-state-university----san-bernardino','california-state-university----chico','california-state-university----dominguez-hills','california-state-university----fresno','california-state-university----fullerton','california-state-university----northridge','university-of-california----davis','university-of-california----irvine','university-of-california----los-angeles','university-of-california----santa-barbara','university-of-la-verne','loyola-marymount-university','mills-college','occidental-college','university-of-the-pacific','pepperdine-university','pomona-college','university-of-redlands','san-diego-state-university','university-of-san-francisco','san-jose-state-university','scripps-college','sonoma-state-university','vanguard-university-of-southern-california','university-of-southern-california','california-state-university----san-marcos','california-polytechnic-state-university----san-luis-obispo','california-state-university----east-bay','california-state-university----sacramento','university-of-california----riverside','university-of-california----san-diego','university-of-california----santa-cruz','chapman-university','claremont-mckenna-college','harvey-mudd-college','humboldt-state-university','pitzer-college','university-of-san-diego','santa-clara-university','stanford-university','azusa-pacific-university','biola-university','california-state-university----bakersfield','california-state-polytechnic-university----pomona','california-state-university----long-beach','california-state-university----los-angeles','university-of-california----berkeley','pacific-union-college','point-loma-nazarene-university','san-francisco-state-university','whittier-college','california-state-university----monterey-bay','university-of-colorado-denver','university-of-colorado-at-colorado-springs','university-of-colorado----boulder','colorado-school-of-mines','colorado-state-university','university-of-denver','colorado-mesa-university','metropolitan-state-college-of-denver','colorado-state-university----pueblo','colorado-college','university-of-northern-colorado','regis-university','university-of-bridgeport','fairfield-university','university-of-hartford','university-of-new-haven','quinnipiac-university','sacred-heart-university','southern-connecticut-state-university','trinity-college','western-connecticu
t-state-university','yale-university','connecticut-college','university-of-connecticut','wesleyan-university','central-connecticut-state-university','delaware-state-university','university-of-delaware','american-university','catholic-university-of-america','george-washington-university','howard-university','georgetown-university','university-of-the-district-of-columbia','barry-university','bethune--cookman-university','university-of-central-florida','eckerd-college','embry--riddle-aeronautical-university----daytona-beach','florida-a-and-m-university','florida-international-university','florida-southern-college','florida-state-university','university-of-miami','university-of-north-florida','nova-southeastern-university','rollins-college','university-of-south-florida','stetson-university','university-of-tampa','new-college-of-florida','jacksonville-university','saint-leo-university','southeastern-university----florida','university-of-west-florida','ave-maria-university','south-university----tampa','florida-atlantic-university','florida-institute-of-technology','university-of-florida','florida-gulf-coast-university','albany-state-university','augusta-state-university','columbus-state-university','emory-university','fort-valley-state-university','georgia-institute-of-technology','georgia-college--and--state-university','university-of-georgia','north-georgia-college--and--state-university','valdosta-state-university','georgia-gwinnett-college','agnes-scott-college','armstrong-atlantic-state-university','clark-atlanta-university','berry-college','covenant-college','georgia-southern-university','georgia-state-university','wesleyan-college','university-of-west-georgia','clayton-state-university','kennesaw-state-university','mercer-university','morehouse-college','oglethorpe-university','savannah-state-university','spelman-college','southern-polytechnic-state-university','university-of-hawaii-at-hilo','university-of-hawaii-at-manoa','hawaii-pacific-university','boise-state-university','idaho-state-university','brigham-young-university----idaho','university-of-idaho','augustana-college----rock-island','university-of-chicago','columbia-college-chicago','elmhurst-college','illinois-wesleyan-university','illinois-state-university','millikin-university','northwestern-university','northeastern-illinois-university','olivet-nazarene-university','southern-illinois-university-carbondale','southern-illinois-university-edwardsville','western-illinois-university','bradley-university','depaul-university','eastern-illinois-university','illinois-institute-of-technology','lewis-university','mckendree-university','north-central-college','northern-illinois-university','roosevelt-university','wheaton-college----illinois','aurora-university','chicago-state-university','university-of-illinois-at-chicago','university-of-illinois-at-urbana--champaign','loyola-university-chicago','anderson-university----indiana','butler-university','earlham-college','hanover-college','indiana-university--purdue-university----fort-wayne','university-of-indianapolis','university-of-southern-indiana','indiana-university','indiana-wesleyan-university','purdue-university----calumet','saint-marys-college','valparaiso-university','purdue-university','ball-state-university','indiana-state-university','depauw-university','indiana-university----south-bend','university-of-notre-dame','taylor-university','coe-college','drake-university','grinnell-college','university-of-iowa','dordt-college','luther-college','university-of-northern-iowa','cornell-colle
ge','iowa-state-university','kansas-state-university','emporia-state-university','university-of-kansas','wichita-state-university','bellarmine-university','berea-college','university-of-kentucky'] #print list[0] for thisSchool in school_list: #print thisSchool try: page = urllib2.urlopen("http://collegeprowler.com/" + thisSchool + "/diversity/student-polls/") soup = BeautifulSoup(page) table_five = soup.findAll("table")[4] politics_label = table_five.findAll("td")[1].string #label #print politics_label politics_value = table_five.findAll("td")[0].string #value print politics_value except (IndexError): print "Oops! Try again..." scraperwiki.sqlite.save
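# The scraper above ends with a bare "scraperwiki.sqlite.save", which never
# actually calls the datastore. A minimal sketch of the usual ScraperWiki
# save call, meant to be invoked inside the try block; the 'school' unique
# key and column names are assumptions, not taken from the original code:
import scraperwiki

def save_poll_row(school, label, value):
    scraperwiki.sqlite.save(unique_keys=['school'],
                            data={'school': school,
                                  'politics_label': label,
                                  'politics_value': value})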
def test_3_download_jenkins_job(self): mode_start_time=time.time() # Create destination directory destination_dir = 'Jenkins_Job_Files' destination_dir = os.path.join(os.path.dirname(os.path.abspath('.')), destination_dir) if os.path.exists(destination_dir): shutil.rmtree(destination_dir) os.mkdir(destination_dir) #Import BeautifulSoup try: from BeautifulSoup import BeautifulSoup except Exception as e: print_in_color(str(e), 'red') print_in_color('Execute "pip install beautifulsoup" to install it!', 'yellow') exit('Install beautifulsoup and rerun!') # Download logs response = urllib2.urlopen(artifact_url) html = response.read() parsed_url = urlparse.urlparse(artifact_url) base_url = parsed_url.scheme + '://' + parsed_url.netloc soup = BeautifulSoup(html) tar_gz_files = [] ir_logs_urls = [] tempest_log_url = None for link in soup.findAll('a'): if 'tempest-results' in link: tempest_results_url = urljoin(artifact_url, link.get('href')) tempest_response = urllib2.urlopen(tempest_results_url) html = tempest_response.read() soup = BeautifulSoup(html) for link in soup.findAll('a'): if str(link.get('href')).endswith('.html'): tempest_html = link.get('href') tempest_log_url = urljoin(artifact_url, 'tempest-results') + '/' + tempest_html break if str(link.get('href')).endswith('.tar.gz'): tar_gz_files.append(link) tar_link = urlparse.urljoin(artifact_url, link.get('href')) os.system('wget -P ' + destination_dir + ' ' + tar_link) if str(link.get('href')).endswith('.sh'): sh_page_link = urlparse.urljoin(artifact_url, link.get('href')) response = urllib2.urlopen(sh_page_link) html = response.read() soup = BeautifulSoup(html) for link in soup.findAll('a'): if str(link.get('href')).endswith('.log'): ir_logs_urls.append(sh_page_link + '/' + link.get('href')) # Download console.log console_log_url=artifact_url.strip().replace('artifact','consoleFull').strip('/') print console_log_url os.system('wget -P ' + destination_dir + ' ' + console_log_url) shutil.move(os.path.join(destination_dir, 'consoleFull'),os.path.join(destination_dir,'consoleFull.log')) # Download Infrared Logs .sh, files in .sh directory on Jenkins if len(ir_logs_urls)!=0: for url in ir_logs_urls: os.system('wget -P ' + destination_dir + ' ' + url) # Download tempest log (html #) if tempest_log_url!=None: os.system('wget -P ' + destination_dir + ' ' + tempest_log_url) shutil.move(os.path.join(destination_dir, tempest_html),os.path.join(destination_dir,tempest_html.replace('.html','.log'))) # Unzip all downloaded .tar.gz files for fil in os.listdir(os.path.abspath(destination_dir)): if fil.endswith('.tar.gz'): cmd = 'tar -zxvf ' + os.path.join(os.path.abspath(destination_dir), fil) + ' -C ' + os.path.abspath( destination_dir) + ' >/dev/null' + ';' + 'rm -rf ' + os.path.join( os.path.abspath(destination_dir), fil) print_in_color('Unzipping ' + fil + '...', 'bold') os.system(cmd) os.system('rm -rf '+fil) # Run LogTool analyzing print_in_color('\nStart analyzing downloaded OSP logs locally', 'bold') result_dir = 'Jenkins_Job_' + grep_string.replace(' ', '') if os.path.exists(os.path.abspath(result_dir)): shutil.rmtree(os.path.abspath(result_dir)) result_file = os.path.join(os.path.abspath(result_dir), 'LogTool_Result_' + grep_string.replace(' ', '') + '.log') print artifact_url print user_tart_time print undercloud_logs command = "python2 Extract_On_Node.py '" +user_tart_time+ "' " + os.path.abspath( destination_dir) + " '" + grep_string + "'" + ' ' + result_file # shutil.copytree(destination_dir, os.path.abspath(result_dir))
exec_command_line_command('cp -r ' + destination_dir + ' ' + os.path.abspath(result_dir)) print_in_color('\n --> ' + command, 'bold') com_result = exec_command_line_command(command) # print (com_result['CommandOutput']) end_time = time.time() if com_result['ReturnCode'] == 0: spec_print(['Completed!!!', 'You can find the result file + downloaded logs in:', 'Result Directory: ' + result_dir, 'Analyze logs execution time: ' + str(round(end_time - mode_start_time, 2)) + '[sec]'], 'green') else: spec_print(['Completed!!!', 'Result Directory: ' + result_dir, 'Analyze logs execution time: ' + str(round(end_time - mode_start_time, 2)) + '[sec]'], 'red') def test_4_create_final_report(self): print('\ntest_4_create_final_report') report_file_name = 'LogTool_Report.log' if report_file_name in os.listdir('.'): os.remove(report_file_name) report_data='' for key in workers_output: if 'Total_Number_Of_Errors:0' not in workers_output[key]: report_data+='\n'+key+' --> '+workers_output[key] if len(report_data)!=0: append_to_file(report_file_name,report_data+ '\n\nFor more details, check LogTool result files on your setup:' '\n'+os.path.abspath(result_dir))
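# The Jenkins test above shells out to wget for each artifact; a minimal
# sketch of the same download done with urllib2 (which the test already uses
# for page fetches), shown only as an alternative:
import os
import urllib2

def download_to(url, destination_dir):
    """Fetch url and write it under destination_dir, keeping the basename."""
    data = urllib2.urlopen(url).read()
    target = os.path.join(destination_dir, os.path.basename(url.rstrip('/')))
    with open(target, 'wb') as fp:
        fp.write(data)
    return target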
def get_simbad_abstracts(self, srcid_class_match_dict): """ Query and retrieve SIMBAD abstracts, using previously retrieved literature references for Simbad matched ASAS sources. """ import urllib from BeautifulSoup import BeautifulSoup, Comment fp = open('/tmp/src_litrefs.pkl', 'rb') src_litrefs = cPickle.load(fp) fp.close() abs_bibcodes_dirpath = '/home/obs/scratch/determine_simbad' abs_bibcodes = os.listdir(abs_bibcodes_dirpath) abstracts_pkl_dirpath = '/home/obs/scratch/determine_simbad_abstracts.pkl' if os.path.exists(abstracts_pkl_dirpath): fp = open(abstracts_pkl_dirpath, 'rb') abstracts_dict = cPickle.load(fp) fp.close() else: abstracts_dict = {} srcid_list = src_litrefs.keys() srcid_list.sort() for i, src_id in enumerate(srcid_list): src_bib_dict = src_litrefs[src_id] for bibcode, abstract_url in src_bib_dict.iteritems(): if abstracts_dict.has_key(bibcode): continue # skip since we parsed this already fpath = "%s/%s" % (abs_bibcodes_dirpath, bibcode.replace('/', '___')) # TODO: need to check that we have not parsed and place in dict if not bibcode in abs_bibcodes: f_url = urllib.urlopen(abstract_url) webpage_str = f_url.read() f_url.close() fp = open(fpath, 'w') fp.write(webpage_str) fp.close() else: fp = open(fpath) webpage_str = fp.read() fp.close() soup = BeautifulSoup(webpage_str) comments = soup.findAll( text=lambda text: isinstance(text, Comment)) [comment.extract() for comment in comments] #print soup.html.body('p', limit=2)[1]('table', limit=2)[1].prettify() #import pdb; pdb.set_trace() #print try: abstract_rows = soup.html.body('p', limit=2)[1]( 'table', limit=2)[1]('tr') except: print "skipping:", bibcode continue for r in abstract_rows: if 'Title:' in str(r('td')[0].getText()): title = r('td')[2].getText() # in UNICODE elif 'Authors:' in str(r('td')[0].getText()): authors = r('td')[2].getText() # in UNICODE elif 'Publication:' in str(r('td')[0].getText()): publication = r('td')[2].getText() # in UNICODE elif 'Publication Date:' in str(r('td')[0].getText()): publication_date = r('td')[2].getText() # in UNICODE elif 'Keywords:' in str(r('td')[0].getText()): keywords = r('td')[2].getText() # in UNICODE #print "title:%s \nauthors:%s \npub:%s \ndate:%s \nkeywords:%s\n" % (title, authors, publication, publication_date, keywords) print i, src_id, bibcode, title[:60] abstracts_dict[bibcode] = { 'title': title, 'authors': authors, 'publication': publication, 'pub_date': publication_date, 'keywords': keywords, } if os.path.exists(abstracts_pkl_dirpath): os.system('rm ' + abstracts_pkl_dirpath) fp = open(abstracts_pkl_dirpath, 'wb') cPickle.dump(abstracts_dict, fp, 1) fp.close() import pdb pdb.set_trace() print
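# get_simbad_abstracts() checkpoints its results to a pickle; a small sketch
# of reading that file back and looking up one bibcode. The default path is
# the one used above; the printed fields match the dict it stores:
import cPickle

def load_abstract(bibcode, pkl_path='/home/obs/scratch/determine_simbad_abstracts.pkl'):
    fp = open(pkl_path, 'rb')
    abstracts_dict = cPickle.load(fp)
    fp.close()
    entry = abstracts_dict.get(bibcode)
    if entry is not None:
        print entry['title'], entry['pub_date']
    return entry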
def test_email_diff_subtitles(self): initial_count = len(mail.outbox) # set a user who can receive notifications # make sure we have a different author, else he won't get notified author = User(username='******', email='*****@*****.**', notify_by_email=True, valid_email=True) author.save(send_email_confirmation=False) # bypass logic from hell author.valid_email = True author.save() # this is needed for the non_editor template check user2 = User(username='******', email='*****@*****.**', notify_by_email=True, valid_email=True) user2.save(send_email_confirmation=False) # bypass logic from hell user2.valid_email = True user2.save() # version is identical to the previous one video = Video.get_or_create_for_url( "http://wwww.example.com/video-diff.mp4")[0] video.followers.add(author) video.followers.add(user2) language = SubtitleLanguage(video=video, language_code='en') language.save() subs_data = [ [0, 1000, '1'], [1000, 2000, '2'], ] subtitles_1 = SubtitleSet.from_list('en', subs_data) old_version = language.add_version(subtitles=subtitles_1, author=author) # now we change the text on the second sub subs_data[1][2] = '2 changed' # add a regular sub subs_data.append([2000, 3000, 'new sub']) # add an unsynced sub subs_data.append([None, None, 'no sync']) subtitles_2 = SubtitleSet.from_list('en', subs_data) new_version = language.add_version(subtitles=subtitles_2) self.assertTrue(len(video.notification_list()) > 0) res = send_new_version_notification(new_version.pk) self.assertNotEqual(res, None) # we expect two emails, one is the new-edits-non-editor, and # the other for mail_notification.html self.assertEqual(len(mail.outbox), initial_count + 2) for email_number, email_msg in enumerate(mail.outbox): # make sure this is the right message self.assertIn("New edits to ", email_msg.subject) self.assertIn("video-diff.mp4", email_msg.subject) html = BeautifulSoup(email_msg.body) html_text = "".join(html.body(text=True)).replace("\n", "") if email_number == 0: # assert text and timing changes are correct self.assertIn('67% of the text', html_text) self.assertIn('33% of the timing was changed.', html_text) # find the listed text changes to make sure they match diff_table = html.findAll('table', attrs={'class': 'diffs'})[0] old_version_changes = [] new_version_changes = [] for i, node in enumerate(diff_table.findAll('td')): if i % 2 == 0: old_version_changes.append(node.text) else: new_version_changes.append(node.text) self.assertEqual(old_version_changes, [u'2', u'', u'']) self.assertEqual(new_version_changes, [ u'2 changed', u'new sub', u'no sync', ])
br.select_form(nr=0) # Input the info br.form['npa'] = phonenum[:3] br.form['nxx'] = phonenum[3:6] br.form['thoublock'] = phonenum[6] # Find Data br.submit() # Process Data html = br.response().read() soup = BeautifulSoup(html) # Remove HTML Tags text_parts = soup.findAll(text=True) text = ' '.join(text_parts) # Regex Applicable Data prov_name = re.findall('[0-9]+ [0-9]+ \w+ \w+ \w+.+PROV', text) # Convert List to String L = [str(x) for x in prov_name] s = string.join(L, ' ') # Display Server Feedback print "[ ] FoneFinder Server Feedback: " + s # Provider Selection Menu phoneHash = { "1": "Teleflip",
"&WebPageNr=1&WebAction=NewSearch" html0 = scraperwiki.scrape(starting_url) m = re.search("ListBody.csp\?[^\"]+", html0) if m: list_url = baseurl + m.group(0) else: import sys sys.exit(0) html = scraperwiki.scrape(list_url) #print html soup = BeautifulSoup(html) # use BeautifulSoup to get all <td> tags tds = soup.findAll("td", attrs={"class": re.compile("listitem(Odd|Even)")}) oldurl = "" for td in tds: if td.find("a"): url = unicode(td.find("a")["href"]) if url == oldurl: continue oldurl = url #print baseurl + url html2 = scraperwiki.scrape(baseurl + url.replace(" ", "%20")) #print html2 m = re.search("FullBBBody.csp\?[^\"]+", html2) if m: url2 = m.group(0) html3 = scraperwiki.scrape(baseurl + url2)
class HansardParser(object): """Base class for Hansard parsers""" def __init__(self, hansard, html): super(HansardParser, self).__init__() self.hansard = hansard for regex in STARTUP_RE: html = re.sub(regex[0], regex[1], html) self.soup = BeautifulSoup(html, convertEntities='html') # remove comments for t in self.soup.findAll(text=lambda x: isinstance(x, Comment)): t.extract() def parse(self): self.statements = [] self.statement_index = 0 def houseTime(self, number, ampm): ampm = ampm.replace('.', '') number = number.replace('.', ':') match = re.search(r'(\d+):(\d+)', number) if match: # "2:30 p.m." return datetime.datetime.strptime( "%s:%s %s" % (match.group(1), match.group(2), ampm), "%I:%M %p").time() else: # "2 p.m." return datetime.datetime.strptime("%s %s" % (number, ampm), "%I %p").time() def saveProceedingsStatement(self, text, t): text = parsetools.sane_quotes(parsetools.tameWhitespace(text.strip())) if len(text): timestamp = t['timestamp'] if not isinstance(timestamp, datetime.datetime): # The older parser provides only datetime.time objects timestamp = datetime.datetime.combine(self.date, timestamp) statement = Statement(hansard=self.hansard, time=timestamp, text=text, sequence=self.statement_index, who='Proceedings') self.statement_index += 1 self.statements.append(statement) def saveStatement(self, t): def mcUp(match): return 'Mc' + match.group(1).upper() if t['topic']: # Question No. 139-- -> Question No. 139 t['topic'] = re.sub(r'\-+$', '', t['topic']) t['topic'] = re.sub(r"'S", "'s", t['topic']) t['topic'] = re.sub(r'Mc([a-z])', mcUp, t['topic']) if t.hasText(): if not t['member_title']: t['member_title'] = 'Proceedings' print "WARNING: No title for %s" % t.getText().encode( 'ascii', 'replace') timestamp = t['timestamp'] if not isinstance(timestamp, datetime.datetime): # The older parser provides only datetime.time objects timestamp = datetime.datetime.combine(self.date, timestamp) statement = Statement(hansard=self.hansard, heading=t['heading'], topic=t['topic'], time=timestamp, member=t['member'], politician=t['politician'], who=t['member_title'], text=t.getText(), sequence=self.statement_index, written_question=bool(t['written_question'])) if r_notamember.search(t['member_title'])\ and ('Speaker' in t['member_title'] or 'The Chair' in t['member_title']): statement.speaker = True self.statement_index += 1 self.statements.append(statement) if ENABLE_PRINT: print u"HEADING: %s" % t['heading'] print u"TOPIC: %s" % t['topic'] print u"MEMBER TITLE: %s" % t['member_title'] print u"MEMBER: %s" % t['member'] print u"TIME: %s" % t['timestamp'] print u"TEXT: %s" % t.getText() if ENABLE_READLINE: sys.stdin.readline() t.onward()
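# houseTime() above normalizes "p.m." to "pm" and "2.30" to "2:30" before
# calling strptime; a minimal standalone check of the two formats it accepts
# (pure datetime, no Hansard objects assumed):
import datetime

assert datetime.datetime.strptime("2:30 pm", "%I:%M %p").time() == datetime.time(14, 30)
assert datetime.datetime.strptime("2 pm", "%I %p").time() == datetime.time(14, 0)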