def make_emoji_img_elem(emoji_span_elem: CSSSelector) -> Dict[str, Any]:
    """Replace an emoji <span> element with an equivalent <img> element.

    The span's ``emoji-<code>`` CSS class selects the image file; its title
    and text become the img's title and alt text.
    """
    span_classes = emoji_span_elem.get('class')
    code_match = re.search(r'emoji-(?P<emoji_code>\S+)', span_classes)
    # re.search can return None in general, but callers only pass spans
    # that carry an emoji-<code> class, so the match is guaranteed here.
    assert code_match is not None
    code = code_match.group('emoji_code')
    title_text = emoji_span_elem.get('title')
    alt_text = emoji_span_elem.text
    src = '{}/static/generated/emoji/images-{}-64/{}.png'.format(
        base_url, emojiset, code)
    replacement = lxml.html.fromstring(
        '<img alt="{}" src="{}" title="{}">'.format(alt_text, src, title_text))
    replacement.set('style', 'height: 20px;')
    # Preserve any trailing text that followed the original span.
    replacement.tail = emoji_span_elem.tail
    return replacement
def get_selected_items(self):
    """Yield an Item for each entry on the "selected items" page."""
    page = self.session.get(self.url('selected_items'))
    tree = lxml.html.fromstring(page.text)
    # Pre-built selectors for the pieces of each list entry.
    select_entries = CSSSelector('div[headers="th_selected_items"]')
    select_title = CSSSelector('h4.il_ContainerItemTitle')
    select_icon = CSSSelector('img.ilListItemIcon')
    for entry_node in select_entries(tree):
        item = Item()
        title = select_title(entry_node)[0]
        # The title is usually wrapped in a link; fall back to the bare
        # heading when no anchor is present.
        anchors = CSSSelector('a')(title)
        if anchors:
            title = anchors[0]
        item.name = title.text
        item.url = title.get('href')
        item.icon = select_icon(entry_node)[0].get('src')
        yield item
def get_selected_items(self):
    """Generate Item objects for every row of the selected-items listing."""
    markup = self.session.get(self.url("selected_items")).text
    document = lxml.html.fromstring(markup)
    row_sel = CSSSelector('div[headers="th_selected_items"]')
    title_sel = CSSSelector("h4.il_ContainerItemTitle")
    icon_sel = CSSSelector("img.ilListItemIcon")
    for row in row_sel(document):
        heading = title_sel(row)[0]
        try:
            # Prefer the anchor inside the heading when one exists.
            heading = CSSSelector("a")(heading)[0]
        except IndexError:
            pass  # heading without a link; use the <h4> node directly
        entry = Item()
        entry.name = heading.text
        entry.url = heading.get("href")
        entry.icon = icon_sel(row)[0].get("src")
        yield entry
def make_emoji_img_elem(emoji_span_elem: CSSSelector) -> Dict[str, Any]:
    """Build an <img> element rendering the emoji described by the span.

    The ``emoji-<code>`` CSS class picks the image file; the span's title
    and text become the img's title and alt attributes.
    """
    css_classes = emoji_span_elem.get("class")
    found = re.search(r"emoji-(?P<emoji_code>\S+)", css_classes)
    # The caller only passes spans carrying an emoji-<code> class, so the
    # search cannot fail even though re.search may return None in general.
    assert found is not None
    code = found.group("emoji_code")
    title_text = emoji_span_elem.get("title")
    alt_text = emoji_span_elem.text
    src = "{}/static/generated/emoji/images-{}-64/{}.png".format(
        base_url, emojiset, code)
    markup = '<img alt="{}" src="{}" title="{}">'.format(alt_text, src, title_text)
    img = lxml.html.fromstring(markup)
    img.set("style", "height: 20px;")
    img.tail = emoji_span_elem.tail  # keep trailing text of the original span
    return img
def make_emoji_img_elem(emoji_span_elem: CSSSelector) -> Dict[str, Any]:
    """Convert an emoji <span> element into an equivalent <img> element.

    The span's ``emoji-<code>`` CSS class determines which image file is
    referenced; the span's title and text become the img's title and alt
    text, and the span's tail text is carried over so surrounding text is
    preserved.
    """
    classes = emoji_span_elem.get('class')
    # Raw string for the pattern: '\S' inside a plain literal is an invalid
    # escape sequence (SyntaxWarning on modern Python, slated to become an
    # error) -- the original non-raw pattern was the bug fixed here.
    match = re.search(r'emoji-(?P<emoji_code>\S+)', classes)
    # re.search is capable of returning None,
    # but since the parent function should only be called with a valid css element
    # we assert that it does not.
    assert match is not None
    emoji_code = match.group('emoji_code')
    emoji_name = emoji_span_elem.get('title')
    alt_code = emoji_span_elem.text
    image_url = base_url + '/static/generated/emoji/images-%(emojiset)s-64/%(emoji_code)s.png' % {
        'emojiset': emojiset,
        'emoji_code': emoji_code,
    }
    img_elem = lxml.html.fromstring(
        '<img alt="%(alt_code)s" src="%(image_url)s" title="%(title)s">' % {
            'alt_code': alt_code,
            'image_url': image_url,
            'title': emoji_name,
        })
    img_elem.set('style', 'height: 20px;')
    # Keep any text that followed the original span.
    img_elem.tail = emoji_span_elem.tail
    return img_elem
def load_stations(file="stations-converted.json"):
    """Load the station table from *file* into the global STATIONS dict,
    then try to fetch chemistry sample data for each station from the
    CHMI IS ARROW web application (Python 2 code: urllib2, print statements).

    NOTE(review): the `station` loop variable is never used to build the
    request URIs -- every iteration fetches the same hard-coded object.
    Presumably the seq/id should come from the station record; confirm
    against IS ARROW before relying on the output.
    """
    global STATIONS
    with open(file) as f:
        STATIONS = anyjson.deserialize(f.read())
    for station in STATIONS.values():
        try:
            # Step 1: fetch the object page (cp1250-encoded) and pull the
            # hidden form field 'seq' identifying the session/object.
            uri = "http://hydro.chmi.cz/isarrow/object.php?seq=2000855701&chemie=1&biota=1&ukol_p=1&id_objekt=&vod_typ=R&nadmh_sign=%3E&rickm_sign=%3E&rok_od=2007&rok_do=2012&objekty_chemdata=1&matrice=2000868184&typodb=41"
            seq = CSSSelector("form input[name='seq']")(fromstring(urllib2.urlopen(uri).read().decode("cp1250")))[
                0
            ].value
            # print 'seq is ' + seq
            # Step 2: request the chemistry-sample listing for that seq.
            uri = (
                "http://hydro.chmi.cz/isarrow/object.php?agenda=POV&objekty_chemdata=1&objekty_biodata=&taxon_tree=&seq="
                + seq
                + "&data_sel=chemdata&chemie=1&biota=1&rok_od=2007&rok_do=2012&matrice=2000868184&typodb=41&tscongrp=&tscon=&data_mez_stanovitelnosti=&data_od=&data_do=&taxon=&send=Chemick%E9+vzorky"
            )
            tree = fromstring(urllib2.urlopen(uri).read().decode("cp1250"))
            # Step 3: the last table link leads to the result page.
            link = CSSSelector("table.tbl a")(tree)[-1]
            uri = "http://hydro.chmi.cz/isarrow/" + link.get("href")
            tree = fromstring(urllib2.urlopen(uri).read().decode("cp1250"))
            # Step 4: the first link inside the first form is the CSV export.
            csv_link = tree.xpath("//form[1]//a")[0]
            uri = "http://hydro.chmi.cz/isarrow/" + csv_link.get("href")
            # FIXME: CSV export is now broken on IS ARROW
            # wait for them to fix it or parse from table -- and store relevant data into structure
            reader = csv.reader(urllib2.urlopen(uri))
            for row in reader:
                print row
        except Exception:
            # Best-effort per station: report the failure and continue.
            print "Failed to retrieve values for station " + station["id"]
            import traceback

            traceback.print_exc()
def get_random_anime(self, genre='All', excluded_anime=''):
    """Returns an AnimeMetaObject for a random anime"""
    query = urllib.urlencode({
        'selectGenre': genre,
        'excludedAnime': excluded_anime
    })
    response = self.conn.scrape.post('%sGetRandomAnime?%s' % (BASE_URL, query))
    document = lxml.html.fromstring(response.text)
    # Need at least two <p> nodes: the first holds the tag links, the
    # second holds the description in its title attribute.
    paras = CSSSelector('p')(document)
    if paras is None or len(paras) < 2:
        return None
    headers = CSSSelector('.bigChar')(document)  # holds title and URL
    if headers is None or len(headers) < 1:
        return None
    header = headers[0]
    tags = [anchor.text for anchor in paras[0].cssselect('a')]
    return AnimeMetaObject(
        header.text,                         # title
        BASE_URL[:-1] + header.get('href'),  # absolute page URL
        tags,
        paras[1].get('title'),               # description
    )
def _get_post_details(post_listing):
    """Scrape a post and return as a Post object.

    *post_listing* is an lxml node for one forum listing row; returns None
    (after logging) when robots.txt disallows fetching the post page.
    Python 2 code (print statements).
    """
    title_node = CSSSelector("a.topictitle")(post_listing)[0]
    title = title_node.text_content()
    # The listing href is relative; drop its first two characters
    # (presumably a "./" prefix -- TODO confirm against the forum markup)
    # before joining onto the forum base URL.
    url = _forum_url + title_node.get("href")[2:]
    if rp.can_fetch("*", url):
        print "Scraping post: " + title
        post_page = lxml.html.fromstring(_get_page(url))
        # Each helper pulls one field out of the fetched post page.
        author = _get_post_author(post_page)
        content = _get_post_content(post_page)
        images = _get_post_images(post_page)
        privateMessageLink = _get_private_message_link(post_page)
        return Post(title, author, url, content, images, privateMessageLink)
    else:
        _robots_not_allowed(url)
        return None
sel = CSSSelector('table tbody tr') rows = sel(tree) print "Row results: ", len(rows) num_operating = 0 for row in rows: # This is unrealiable; I don't know how to get just the text 'Operating': # phase = CSSSelector('td:nth-of-type(4)')(row)[0] # lxml.html.tostring(phase) # '<td><span class="hide">3</span>Operating</td>' phase = CSSSelector('td:nth-of-type(4)')(row)[0].text_content() # '3Operating' if 'Operating' in phase: num_operating += 1 # Show phase because we may have stale ones in iSat division = CSSSelector('td:nth-of-type(1)')(row)[0] try: division = division.text.strip() except AttributeError, e: division = 'NOTFOUND' mission = CSSSelector('td:nth-of-type(2) > a')(row)[0] mission_name = mission.text.strip() mission_url = mission.get('href') # /missions/xmm-newton/ mission_slug = mission_url.split('/')[2] num_operating += 1 try: print '%-30s\t%-40s\t%-20s\t%-20s' % (mission_slug, mission_name.encode('ascii', 'ignore'), division, phase) except UnicodeEncodeError, e: print "F*****g unicode problem: ", e import pdb; pdb.set_trace() print 'Operating:', num_operating
num_operating = 0 for row in rows: # This is unrealiable; I don't know how to get just the text 'Operating': # phase = CSSSelector('td:nth-of-type(4)')(row)[0] # lxml.html.tostring(phase) # '<td><span class="hide">3</span>Operating</td>' phase = CSSSelector('td:nth-of-type(4)')( row)[0].text_content() # '3Operating' if 'Operating' in phase: num_operating += 1 # Show phase because we may have stale ones in iSat division = CSSSelector('td:nth-of-type(1)')(row)[0] try: division = division.text.strip() except AttributeError, e: division = 'NOTFOUND' mission = CSSSelector('td:nth-of-type(2) > a')(row)[0] mission_name = mission.text.strip() mission_url = mission.get('href') # /missions/xmm-newton/ mission_slug = mission_url.split('/')[2] num_operating += 1 try: print '%-30s\t%-40s\t%-20s\t%-20s' % ( mission_slug, mission_name.encode('ascii', 'ignore'), division, phase) except UnicodeEncodeError, e: print "F*****g unicode problem: ", e import pdb pdb.set_trace() print 'Operating:', num_operating