def make_emoji_img_elem(emoji_span_elem: CSSSelector) -> Dict[str, Any]:
    """Build an ``<img>`` element that stands in for an emoji ``<span>``.

    The emoji code is taken from the span's ``emoji-<code>`` class, the
    title from its ``title`` attribute and the alt text from its text. The
    span's tail text is carried over to the new element.

    ``base_url`` and ``emojiset`` are not defined in this block; presumably
    they come from the enclosing scope -- verify against the caller.

    NOTE(review): the annotations are left untouched for compatibility, but
    the argument is used as an element (``.get``, ``.text``, ``.tail``) and
    the value returned is whatever ``lxml.html.fromstring`` produces.

    Raises:
        AssertionError: if the span has no ``emoji-<code>`` class.
    """
    # Function-scope import so the block does not depend on file-level imports.
    from html import escape

    # Convert the emoji spans to img tags.
    classes = emoji_span_elem.get('class')
    match = re.search(r'emoji-(?P<emoji_code>\S+)', classes)
    # re.search is capable of returning None,
    # but since the parent function should only be called with a valid css element
    # we assert that it does not.
    assert match is not None
    emoji_code = match.group('emoji_code')
    emoji_name = emoji_span_elem.get('title')
    alt_code = emoji_span_elem.text
    image_url = base_url + '/static/generated/emoji/images-%(emojiset)s-64/%(emoji_code)s.png' % {
        'emojiset': emojiset,
        'emoji_code': emoji_code,
    }
    # The values are spliced into markup that is parsed again, so quotes,
    # angle brackets and ampersands must be escaped or they would terminate
    # the attribute / inject markup. str() mirrors what "%s" did before
    # (e.g. a missing title still renders as "None").
    img_elem = lxml.html.fromstring(
        '<img alt="%(alt_code)s" src="%(image_url)s" title="%(title)s">' %
        {
            'alt_code': escape(str(alt_code)),
            'image_url': escape(str(image_url)),
            'title': escape(str(emoji_name)),
        })
    img_elem.set('style', 'height: 20px;')
    img_elem.tail = emoji_span_elem.tail
    return img_elem
예제 #2
0
    def get_selected_items(self):
        """Yield an ``Item`` for every entry on the "selected_items" page.

        The page at ``self.url('selected_items')`` is fetched through
        ``self.session`` and parsed; each ``div[headers="th_selected_items"]``
        produces one ``Item`` with ``name``, ``url`` and ``icon`` filled in.

        Raises:
            IndexError: if an entry has no ``h4.il_ContainerItemTitle`` or no
                ``img.ilListItemIcon`` element.
        """
        response = self.session.get(self.url('selected_items'))
        tree = lxml.html.fromstring(response.text)

        item_sel = CSSSelector('div[headers="th_selected_items"]')
        name_sel = CSSSelector('h4.il_ContainerItemTitle')
        icon_sel = CSSSelector('img.ilListItemIcon')
        # Compiled once here instead of once per entry inside the loop.
        link_sel = CSSSelector('a')

        for result in item_sel(tree):
            item = Item()

            name = name_sel(result)[0]
            # Prefer the link inside the title when there is one; otherwise
            # the title element itself supplies the text (and the href lookup).
            links = link_sel(name)
            if links:
                name = links[0]

            item.name = name.text
            item.url = name.get('href')
            item.icon = icon_sel(result)[0].get('src')

            yield item
예제 #3
0
    def get_selected_items(self):
        """Yield an ``Item`` for every entry on the "selected_items" page.

        The page at ``self.url("selected_items")`` is fetched through
        ``self.session`` and parsed; each ``div[headers="th_selected_items"]``
        produces one ``Item`` with ``name``, ``url`` and ``icon`` filled in.

        Raises:
            IndexError: if an entry has no ``h4.il_ContainerItemTitle`` or no
                ``img.ilListItemIcon`` element.
        """
        response = self.session.get(self.url("selected_items"))
        tree = lxml.html.fromstring(response.text)

        item_sel = CSSSelector('div[headers="th_selected_items"]')
        name_sel = CSSSelector("h4.il_ContainerItemTitle")
        icon_sel = CSSSelector("img.ilListItemIcon")
        # Compiled once here instead of once per entry inside the loop.
        link_sel = CSSSelector("a")

        for result in item_sel(tree):
            item = Item()

            name = name_sel(result)[0]
            # Prefer the link inside the title when there is one; otherwise
            # the title element itself supplies the text (and the href lookup).
            links = link_sel(name)
            if links:
                name = links[0]

            item.name = name.text
            item.url = name.get("href")
            item.icon = icon_sel(result)[0].get("src")

            yield item
예제 #4
0
 def make_emoji_img_elem(emoji_span_elem: CSSSelector) -> Dict[str, Any]:
     """Build an ``<img>`` element that stands in for an emoji ``<span>``.

     The emoji code is taken from the span's ``emoji-<code>`` class, the
     title from its ``title`` attribute and the alt text from its text. The
     span's tail text is carried over to the new element.

     ``base_url`` and ``emojiset`` are not defined in this block; presumably
     they come from the enclosing scope -- verify against the caller.

     NOTE(review): the annotations are left untouched for compatibility, but
     the argument is used as an element (``.get``, ``.text``, ``.tail``) and
     the value returned is whatever ``lxml.html.fromstring`` produces.

     Raises:
         AssertionError: if the span has no ``emoji-<code>`` class.
     """
     # Function-scope import so the block does not depend on file-level imports.
     from html import escape

     # Convert the emoji spans to img tags.
     classes = emoji_span_elem.get("class")
     match = re.search(r"emoji-(?P<emoji_code>\S+)", classes)
     # re.search is capable of returning None,
     # but since the parent function should only be called with a valid css element
     # we assert that it does not.
     assert match is not None
     emoji_code = match.group("emoji_code")
     emoji_name = emoji_span_elem.get("title")
     alt_code = emoji_span_elem.text
     image_url = base_url + f"/static/generated/emoji/images-{emojiset}-64/{emoji_code}.png"
     # The values are spliced into markup that is parsed again, so quotes,
     # angle brackets and ampersands must be escaped or they would terminate
     # the attribute / inject markup. str() mirrors the f-string conversion
     # used before (e.g. a missing title still renders as "None").
     alt_attr = escape(str(alt_code))
     src_attr = escape(str(image_url))
     title_attr = escape(str(emoji_name))
     img_elem = lxml.html.fromstring(
         f'<img alt="{alt_attr}" src="{src_attr}" title="{title_attr}">')
     img_elem.set("style", "height: 20px;")
     img_elem.tail = emoji_span_elem.tail
     return img_elem
예제 #5
0
 def make_emoji_img_elem(emoji_span_elem: CSSSelector) -> Dict[str, Any]:
     """Build an ``<img>`` element that stands in for an emoji ``<span>``.

     The emoji code is taken from the span's ``emoji-<code>`` class, the
     title from its ``title`` attribute and the alt text from its text. The
     span's tail text is carried over to the new element.

     ``base_url`` and ``emojiset`` are not defined in this block; presumably
     they come from the enclosing scope -- verify against the caller.

     NOTE(review): the annotations are left untouched for compatibility, but
     the argument is used as an element (``.get``, ``.text``, ``.tail``) and
     the value returned is whatever ``lxml.html.fromstring`` produces.

     Raises:
         AssertionError: if the span has no ``emoji-<code>`` class.
     """
     # Function-scope import so the block does not depend on file-level imports.
     from html import escape

     # Convert the emoji spans to img tags.
     classes = emoji_span_elem.get('class')
     # Raw string: "\S" in a plain literal is an invalid escape sequence.
     match = re.search(r'emoji-(?P<emoji_code>\S+)', classes)
     # re.search is capable of returning None,
     # but since the parent function should only be called with a valid css element
     # we assert that it does not.
     assert match is not None
     emoji_code = match.group('emoji_code')
     emoji_name = emoji_span_elem.get('title')
     alt_code = emoji_span_elem.text
     image_url = base_url + '/static/generated/emoji/images-%(emojiset)s-64/%(emoji_code)s.png' % {
         'emojiset': emojiset,
         'emoji_code': emoji_code,
     }
     # The values are spliced into markup that is parsed again, so quotes,
     # angle brackets and ampersands must be escaped or they would terminate
     # the attribute / inject markup. str() mirrors what "%s" did before
     # (e.g. a missing title still renders as "None").
     img_elem = lxml.html.fromstring(
         '<img alt="%(alt_code)s" src="%(image_url)s" title="%(title)s">' % {
             'alt_code': escape(str(alt_code)),
             'image_url': escape(str(image_url)),
             'title': escape(str(emoji_name)),
         })
     img_elem.set('style', 'height: 20px;')
     img_elem.tail = emoji_span_elem.tail
     return img_elem
def load_stations(file="stations-converted.json"):
    """Load stations from *file* into the global ``STATIONS`` and dump CSV rows.

    The JSON content of *file* is deserialized into ``STATIONS``. Then, once
    per station, the IS ARROW pages are walked (start form -> sample listing
    -> last link of ``table.tbl`` -> first link of the first form) and every
    row of the resulting CSV download is printed.

    Any exception raised while processing a station is reported together
    with the station id and a traceback; the loop then moves on.
    """
    global STATIONS

    with open(file) as handle:
        STATIONS = anyjson.deserialize(handle.read())

    site_root = "http://hydro.chmi.cz/isarrow/"

    def fetch_tree(address):
        # Download a page, decode it as cp1250 and parse it into a tree.
        return fromstring(urllib2.urlopen(address).read().decode("cp1250"))

    for station in STATIONS.values():
        try:
            # The start page carries the "seq" value needed by the next request.
            seq_selector = CSSSelector("form input[name='seq']")
            seq = seq_selector(
                fetch_tree(
                    "http://hydro.chmi.cz/isarrow/object.php?seq=2000855701&chemie=1&biota=1&ukol_p=1&id_objekt=&vod_typ=R&nadmh_sign=%3E&rickm_sign=%3E&rok_od=2007&rok_do=2012&objekty_chemdata=1&matrice=2000868184&typodb=41"
                )
            )[0].value

            # print 'seq is ' + seq

            samples_page = fetch_tree(
                "http://hydro.chmi.cz/isarrow/object.php?agenda=POV&objekty_chemdata=1&objekty_biodata=&taxon_tree=&seq="
                + seq
                + "&data_sel=chemdata&chemie=1&biota=1&rok_od=2007&rok_do=2012&matrice=2000868184&typodb=41&tscongrp=&tscon=&data_mez_stanovitelnosti=&data_od=&data_do=&taxon=&send=Chemick%E9+vzorky"
            )

            # Follow the last link of the listing table.
            last_link = CSSSelector("table.tbl a")(samples_page)[-1]
            export_page = fetch_tree(site_root + last_link.get("href"))

            csv_anchor = export_page.xpath("//form[1]//a")[0]
            csv_uri = site_root + csv_anchor.get("href")

            # FIXME: CSV export is now broken on IS ARROW
            # wait for them to fix it or parse from table -- and store relevant data into structure
            for row in csv.reader(urllib2.urlopen(csv_uri)):
                print(row)

        except Exception:
            print("Failed to retrieve values for station " + station["id"])
            import traceback

            traceback.print_exc()
예제 #7
0
 def get_random_anime(self, genre='All', excluded_anime=''):
     """Request a random anime and wrap the result in an AnimeMetaObject.

     *genre* and *excluded_anime* are sent as the ``selectGenre`` and
     ``excludedAnime`` query parameters of the ``GetRandomAnime`` request.

     Returns None when the response holds fewer than two ``<p>`` elements
     or no ``.bigChar`` element.
     """
     query = urllib.urlencode({
         'selectGenre': genre,
         'excludedAnime': excluded_anime
     })
     request_url = '%sGetRandomAnime?%s' % (BASE_URL, query)
     response = self.conn.scrape.post(request_url)
     page = lxml.html.fromstring(response.text)

     paragraphs = CSSSelector('p')(page)
     if paragraphs is None or len(paragraphs) < 2:
         return None

     headings = CSSSelector('.bigChar')(page)  # Holds title and URL
     if headings is None or len(headings) < 1:
         return None
     heading = headings[0]

     # First paragraph carries the tag links, second one the description.
     tag_paragraph, description_paragraph = paragraphs[0], paragraphs[1]
     description = description_paragraph.get('title')
     tags = [anchor.text for anchor in tag_paragraph.cssselect('a')]
     title = heading.text
     # BASE_URL's last character is dropped before the href is appended.
     url = BASE_URL[:-1] + heading.get('href')
     return AnimeMetaObject(title, url, tags, description)
예제 #8
0
def _get_post_details(post_listing):
    """Scrape the post behind a listing entry and return it as a Post.

    The title and link come from the listing's ``a.topictitle`` element; the
    first two characters of its href are dropped before it is appended to
    ``_forum_url``. Returns None (after calling ``_robots_not_allowed``)
    when ``rp.can_fetch`` rejects the URL.
    """
    title_link = CSSSelector("a.topictitle")(post_listing)[0]
    title = title_link.text_content()
    url = _forum_url + title_link.get("href")[2:]

    # Bail out early when the robots rules forbid fetching this URL.
    if not rp.can_fetch("*", url):
        _robots_not_allowed(url)
        return None

    print("Scraping post: " + title)

    page = lxml.html.fromstring(_get_page(url))
    return Post(
        title,
        _get_post_author(page),
        url,
        _get_post_content(page),
        _get_post_images(page),
        _get_private_message_link(page),
    )
예제 #9
0
sel = CSSSelector('table tbody tr')
rows = sel(tree)
print "Row results: ", len(rows)
num_operating = 0
for row in rows:
    # This is unrealiable; I don't know how to get just the text 'Operating':
    #  phase = CSSSelector('td:nth-of-type(4)')(row)[0]
    #  lxml.html.tostring(phase)
    #  '<td><span class="hide">3</span>Operating</td>'
    phase = CSSSelector('td:nth-of-type(4)')(row)[0].text_content() # '3Operating'
    if 'Operating' in phase:
        num_operating += 1
    # Show phase because we may have stale ones in iSat
    division = CSSSelector('td:nth-of-type(1)')(row)[0]
    try:
        division = division.text.strip()
    except AttributeError, e:
        division = 'NOTFOUND'
    mission = CSSSelector('td:nth-of-type(2) > a')(row)[0]
    mission_name = mission.text.strip()
    mission_url = mission.get('href') # /missions/xmm-newton/
    mission_slug = mission_url.split('/')[2]
    num_operating += 1
    try:
        print '%-30s\t%-40s\t%-20s\t%-20s' % (mission_slug, mission_name.encode('ascii', 'ignore'), division, phase)
    except UnicodeEncodeError, e:
        print "F*****g unicode problem: ", e
        import pdb; pdb.set_trace()
print 'Operating:', num_operating

예제 #10
0
num_operating = 0
for row in rows:
    # This is unrealiable; I don't know how to get just the text 'Operating':
    #  phase = CSSSelector('td:nth-of-type(4)')(row)[0]
    #  lxml.html.tostring(phase)
    #  '<td><span class="hide">3</span>Operating</td>'
    phase = CSSSelector('td:nth-of-type(4)')(
        row)[0].text_content()  # '3Operating'
    if 'Operating' in phase:
        num_operating += 1
    # Show phase because we may have stale ones in iSat
    division = CSSSelector('td:nth-of-type(1)')(row)[0]
    try:
        division = division.text.strip()
    except AttributeError, e:
        division = 'NOTFOUND'
    mission = CSSSelector('td:nth-of-type(2) > a')(row)[0]
    mission_name = mission.text.strip()
    mission_url = mission.get('href')  # /missions/xmm-newton/
    mission_slug = mission_url.split('/')[2]
    num_operating += 1
    try:
        print '%-30s\t%-40s\t%-20s\t%-20s' % (
            mission_slug, mission_name.encode('ascii',
                                              'ignore'), division, phase)
    except UnicodeEncodeError, e:
        print "F*****g unicode problem: ", e
        import pdb
        pdb.set_trace()
print 'Operating:', num_operating