def _get_page(self, place):
    if place.lower() in self.places:
        place = self.places[place.lower()]

    soup = get_html_parse_tree(
        'http://m.wund.com/cgi-bin/findweather/getForecast?'
        + urlencode({'brand': 'mobile_metric',
                     'query': place.encode('utf-8')}))

    if soup.body.center and soup.body.center.b.string == 'Search not found:':
        raise Weather.WeatherException(u'City not found')

    if soup.table.tr.th and soup.table.tr.th.string == 'Place: Temperature':
        places = []
        for td in soup.table.findAll('td'):
            places.append(td.find('a', href=re.compile('.*html$')).string)

        # Cities with more than one airport give duplicate entries.
        # We can take the first.
        if len([x for x in places if x == places[0]]) == len(places):
            url = urljoin('http://m.wund.com/cgi-bin/findweather/getForecast',
                          soup.table.find('td')
                              .find('a', href=re.compile('.*html$'))['href'])
            soup = get_html_parse_tree(url)
        else:
            raise Weather.TooManyPlacesException(places)

    return soup
def scrape_status(self, stream):
    tree = get_html_parse_tree(self.streams[stream]['url'] + 'status.xsl',
                               treetype='etree')
    main_table = tree.findall('.//table')[2]
    status = {}
    for row in main_table.findall('.//tr'):
        key, value = [x.text for x in row.findall('td')]
        status[key[:-1]] = value
    return status
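# A hypothetical usage sketch for scrape_status(): the stream name, base URL
# and status labels below are assumptions, not from the original source.
# It fetches <base url>status.xsl (Icecast's status page), walks the third
# <table>, and strips the trailing ':' from each label cell.
#
#   self.streams = {'lounge': {'url': 'http://localhost:8000/'}}
#   status = self.scrape_status('lounge')
#   # => {u'Stream Title': u'...', u'Current Song': u'...'}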
def _google_scrape_search(self, query, country=None):
    params = {'q': query.encode('utf-8')}
    if country:
        params['cr'] = u'country' + country.upper()
    return get_html_parse_tree(
        'http://www.google.com/search?' + urlencode(params),
        headers={'user-agent': self.user_agent}, treetype='etree')
def _login(self, user, password):
    params = urlencode({'NAME': user.encode('utf-8'),
                        'PASSWORD': password.encode('utf-8')})
    try:
        etree = get_html_parse_tree(u'http://ace.delos.com/usacogate',
                                    data=params, treetype=u'etree')
    except URLError:
        raise UsacoException(u'Sorry, USACO (or my connection?) is down')
    for font in etree.getiterator(u'font'):
        if font.text and u'Please try again' in font.text:
            return None
    return etree
def _get_title(self, url):
    "Gets the title of a page"
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        etree = get_html_parse_tree(url, None, headers, 'etree')
        title = etree.findtext('head/title')
        return title or url
    except Exception, e:
        log.debug(u"Error determining title for %s: %s", url, unicode(e))
        return url
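# Every snippet in this section leans on a shared get_html_parse_tree()
# utility that is not defined here. A minimal sketch, assuming the signature
# implied by the call sites -- (url, data, headers, treetype) -- and the
# Python 2 era libraries; the real helper may add encoding detection, error
# handling or cookie support.
import urllib2
import html5lib
from BeautifulSoup import BeautifulSoup

def get_html_parse_tree(url, data=None, headers=None, treetype='beautifulsoup'):
    req = urllib2.Request(url, data, headers or {})
    html = urllib2.urlopen(req).read()
    if treetype == 'etree':
        # Callers use unprefixed tag names ('.//table'), so namespacing is off
        return html5lib.parse(html, treebuilder='etree',
                              namespaceHTMLElements=False)
    elif treetype == 'html5lib-beautifulsoup':
        # Older html5lib releases shipped a BeautifulSoup tree builder
        return html5lib.parse(html, treebuilder='beautifulsoup')
    return BeautifulSoup(html)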
def _add_user(self, monitor_url, user):
    matches = re.search(r'a=(.+)&', monitor_url)
    auth = matches.group(1)
    params = urlencode({'STUDENTID': user.encode('utf-8'),
                        'ADD': 'ADD STUDENT',
                        'a': auth.encode('utf-8'),
                        'monitor': '1'})
    try:
        etree = get_html_parse_tree(monitor_url, treetype=u'etree',
                                    data=params)
    except URLError:
        raise UsacoException(u'Sorry, USACO (or my connection?) is down')
    for font in etree.getiterator(u'font'):
        if font.text and u'No STATUS file for' in font.text:
            raise UsacoException(u'Sorry, user %s not found' % user)
def __init__(self, char):
    self.char = char
    url = 'http://www.unicode.org/cgi-bin/GetUnihanData.pl?'
    params = {'codepoint': self.char.encode('utf8'), 'useutf8': 'true'}
    soup = get_html_parse_tree(url + urlencode(params),
                               treetype='html5lib-beautifulsoup')
    tables = soup('table', border="1")
    self.data = defaultdict(unicode,
                            ((html_flatten(td).strip() for td in row('td'))
                             for table in tables
                             for row in table('tr')[1:]))
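# A self-contained illustration of the defaultdict construction above: each
# two-cell table row contributes a (key, value) pair, and missing keys read
# back as u''. The sample rows are made up.
from collections import defaultdict

rows = [[u'kMandarin ', u'hao3'], [u'kDefinition ', u'good']]
data = defaultdict(unicode, ((cell.strip() for cell in row) for row in rows))
assert data[u'kDefinition'] == u'good'
assert data[u'kTotalStrokes'] == u''   # unknown key falls back to u''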
def find_stories(self, url):
    if isinstance(url, basestring):
        tree = get_html_parse_tree(url, treetype='etree')
    else:
        tree = url

    stories = [div for div in tree.findall('.//div')
               if div.get(u'class') == u'story s']
    for story in stories:
        body = story.findtext('div').strip()
        id = story.findtext('.//a')
        if isinstance(id, basestring) and id[1:].isdigit():
            id = int(id[1:])
        yield id, body
def mlia(self, event, query):
    query = query is None and u'random' or query.lower()
    if query == u'random' and event.public and not self.public_browse:
        event.addresponse(u'Sorry, not in public. PM me')
        return

    url = 'http://mylifeisaverage.com/'
    if query == u'random' or query is None:
        if not self.random_pool:
            purl = url + str(randint(1, self.pages))
            tree = get_html_parse_tree(purl, treetype='etree')
            self.random_pool = list(self.find_stories(tree))
            shuffle(self.random_pool)

            pagination = [ul for ul in tree.findall('.//ul')
                          if ul.get(u'class') == u'pages'][0]
            self.pages = int([li for li in pagination.findall('li')
                              if li.get(u'class') == u'last'][0]
                             .find(u'a').get(u'href'))

        story = self.random_pool.pop()
    else:
        try:
            if query.isdigit():
                surl = url + '/s/' + query
            else:
                surl = url + '/best/' + query
            story = self.find_stories(surl).next()
        except StopIteration:
            event.addresponse(u'No such quote')
            return

    id, body = story
    url += 's/%i' % id
    event.addresponse(u'%(body)s\n- %(url)s', {
        'url': url,
        'body': body,
    })
def _load_currencies(self):
    etree = get_html_parse_tree('http://www.xe.com/iso4217.php', headers={
        'User-Agent': 'Mozilla/5.0',
        'Referer': 'http://www.xe.com/',
    }, treetype='etree')

    tbl_main = [x for x in etree.getiterator('table')
                if x.get('class') == 'tbl_main'][0]

    self.currencies = {}
    for tbl_sub in tbl_main.getiterator('table'):
        if tbl_sub.get('class') == 'tbl_sub':
            for tr in tbl_sub.getiterator('tr'):
                code, place = [x.text for x in tr.getchildren()]
                name = u''
                if not place:
                    place = u''
                if u',' in place[1:-1]:
                    place, name = place.split(u',', 1)
                place = place.strip()
                if code in self.currencies:
                    currency = self.currencies[code]
                    # Are we using another country's currency?
                    if place != u'' and name != u'' and (
                            currency[1] == u''
                            or currency[1].rsplit(None, 1)[0] in place
                            or (u'(also called' in currency[1]
                                and currency[1].split(u'(', 1)[0]
                                    .rsplit(None, 1)[0] in place)):
                        currency[0].insert(0, place)
                        currency[1] = name.strip()
                    else:
                        currency[0].append(place)
                else:
                    self.currencies[code] = [[place], name.strip()]

    # Special cases for shared currencies:
    self.currencies['EUR'][0].insert(0, u'Euro Member Countries')
    self.currencies['XOF'][0].insert(0,
        u'Communaut\xe9 Financi\xe8re Africaine')
    self.currencies['XOF'][1] = u'Francs'
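# Illustrative shape of the mapping built above: ISO code -> [places, name].
# The entries below show the expected form, not verbatim xe.com output.
#
#   self.currencies = {
#       'ZAR': [[u'South Africa'], u'Rand'],
#       'EUR': [[u'Euro Member Countries', u'Andorra'], u'Euro'],
#   }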
def get_tfln(self, section):
    tree = get_html_parse_tree('http://textsfromlastnight.com/%s' % section,
                               treetype='etree')
    ul = [x for x in tree.findall('.//ul') if x.get('id') == 'texts-list'][0]
    id_re = re.compile(r'^/Text-Replies-(\d+)\.html$')
    for li in ul.findall('li'):
        id = 0
        message = ''
        div = [x for x in li.findall('div') if x.get('class') == 'text'][0]
        for a in div.findall('.//a'):
            href = a.get('href')
            if href.startswith('/Texts-From-Areacode-'):
                message += u'\n' + a.text
            elif href.startswith('/Text-Replies-'):
                id = int(id_re.match(href).group(1))
                message += a.text
        yield id, message.strip()
def add(self, event, url, name):
    feed = event.session.query(Feed).filter_by(name=name).first()
    if feed:
        event.addresponse(u"I already have the %s feed", name)
        return

    valid = bool(feedparser.parse(url)["version"])

    if not valid:
        try:
            soup = get_html_parse_tree(url)
            for alternate in soup.findAll('link', {
                    'rel': 'alternate',
                    'type': re.compile(r'^application/(atom|rss)\+xml$'),
                    'href': re.compile(r'.+')}):
                newurl = urljoin(url, alternate["href"])
                valid = bool(feedparser.parse(newurl)["version"])
                if valid:
                    url = newurl
                    break
        except:
            pass

    if not valid:
        event.addresponse(u'Sorry, I could not add the %(name)s feed. '
                          u'%(url)s is not a valid feed', {
            'name': name,
            'url': url,
        })
        return

    feed = Feed(unicode(name), unicode(url), event.identity)
    event.session.save(feed)
    event.session.commit()
    event.addresponse(True)
    log.info(u"Added feed '%s' by %s/%s (%s): %s (Found %s entries)",
             name, event.account, event.identity,
             event.sender['connection'], url, len(feed.entries))
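# A hypothetical page that the autodiscovery branch in add() would handle:
# feedparser fails on the HTML itself, but the <link rel="alternate"> tag
# matched by the findAll() call points at the real feed, so the bot retries
# with that URL (urljoin resolves the relative href against the page URL).
#
#   <html><head>
#     <title>Example blog</title>
#     <link rel="alternate" type="application/rss+xml"
#           href="/feeds/posts.rss">
#   </head>...</html>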
def dinner(self, event, who, veg):
    url = 'http://www.whatthefuckshouldimakefordinner.com/'
    if veg:
        url += 'veg.php'
    soup = get_html_parse_tree(url, headers={'Cache-Control': 'max-age=0'})
    link = soup.find('a')
    recipe = u''.join(link.contents)
    if ('f**k' in event.message['raw'].lower()
            or 'wtf' in event.message['raw'].lower()):
        template = u"Try some f*****g %(recipe)s. If you're too thick " \
                   u"to work it out for yourself, there's a recipe at " \
                   u"%(link)s"
    else:
        template = u"Try some %(recipe)s. If you can't " \
                   u"work it out for yourself, there's a recipe at " \
                   u"%(link)s"
    event.addresponse(template, {'recipe': recipe, 'link': link['href']})
def bash(self, event, id):
    id = id is None and u'random' or id.lower()

    if id == u'random' and event.public and not self.public_browse:
        event.addresponse(u'Sorry, not in public. PM me')
        return

    soup = get_html_parse_tree('http://bash.org/?%s' % id)

    number = u"".join(soup.find('p', 'quote').find('b').contents)
    output = [u'%s:' % number]

    body = soup.find('p', 'qt')
    if not body:
        event.addresponse(u"There's no such quote, "
                          u"but if you keep talking like that "
                          u"maybe there will be")
    else:
        for line in body.contents:
            line = unicode(line).strip()
            if line != u'<br />':
                output.append(line)
        event.addresponse(u'\n'.join(output), conflate=False)
def _get_section(self, monitor_url, usaco_user, user):
    try:
        etree = get_html_parse_tree(monitor_url, treetype=u'etree')
    except URLError:
        raise UsacoException(u'Sorry, USACO (or my connection?) is down')

    usaco_user = usaco_user.lower()
    header = True
    for tr in etree.getiterator(u'tr'):
        # Skip the column-header row
        if header:
            header = False
            continue
        tds = [t.text for t in tr.getiterator(u'td')]
        section = u'is on section %s' % tds[5]
        if tds[5] == u'DONE':
            section = u'has completed USACO training'
        if tds[0] and tds[0].lower() == usaco_user:
            return (u'%(user)s (%(usaco_user)s on USACO) %(section)s '
                    u'and last logged in %(days)s ago' % {
                'user': user,
                'usaco_user': usaco_user,
                'days': tds[3],
                'section': section,
            })
    return None
        event.addresponse(self._get_section(monitor_url, usaco_user, user))
    except UsacoException, e:
        event.addresponse(e)
        return

@match(r'^usaco\s+division\s+(?:for\s+)?(.+)$')
def get_division(self, event, user):
    try:
        usaco_user = self._get_usaco_user(event, user)
    except UsacoException, e:
        event.addresponse(e)
        return

    params = urlencode({'id': usaco_user.encode('utf-8'), 'search': 'SEARCH'})
    try:
        etree = get_html_parse_tree(u'http://ace.delos.com/showdiv',
                                    data=params, treetype=u'etree')
    except URLError:
        event.addresponse(u'Sorry, USACO (or my connection?) is down')
        return

    division = [b.text for b in etree.getiterator(u'b')
                if b.text and usaco_user in b.text][0]
    if division.find(u'would compete') != -1:
        event.addresponse(u'%(user)s (%(usaco_user)s on USACO) '
                          u'has not competed in a USACO before',
                          {u'user': user, u'usaco_user': usaco_user})
        return

    matches = re.search(r'(\w+) Division', division)
    division = matches.group(1).lower()
    event.addresponse(u'%(user)s (%(usaco_user)s on USACO) '
                      u'is in the %(division)s division',
                      {u'user': user, u'usaco_user': usaco_user,
                       u'division': division})

def _redact(self, event, term):
    for type in ['raw', 'deaddressed', 'clean', 'stripped']:
        event['message'][type] = re.sub(r'(.*)(%s)' % re.escape(term),
                                        r'\1[redacted]',
                                        event['message'][type])
def _flight_search(self, event, dpt, to, dep_date, ret_date):
    airport_dpt = self._airport_search(dpt)
    airport_to = self._airport_search(to)
    if len(airport_dpt) == 0:
        event.addresponse(u"Sorry, I don't know the airport "
                          u"you want to leave from")
        return
    if len(airport_to) == 0:
        event.addresponse(u"Sorry, I don't know the airport "
                          u"you want to fly to")
        return
    if len(airport_dpt) > 1:
        event.addresponse(u'The following airports match the departure: %s',
                          human_join(self.repr_airport(id)
                                     for id in airport_dpt)[:480])
        return
    if len(airport_to) > 1:
        event.addresponse(u'The following airports match the destination: %s',
                          human_join(self.repr_airport(id)
                                     for id in airport_to)[:480])
        return

    dpt = airport_dpt[0]
    to = airport_to[0]

    def to_travelocity_date(date):
        date = date.lower()
        time = None
        for period in [u'anytime', u'morning', u'afternoon', u'evening']:
            if period in date:
                time = period.title()
                date = date.replace(period, u'')
                break
        try:
            date = parse(date)
        except ValueError:
            raise FlightException(u"Sorry, I can't understand the date %s"
                                  % date)
        if time is None:
            if date.hour == 0 and date.minute == 0:
                time = u'Anytime'
            else:
                time = date.strftime('%I:00')
                if time[0] == u'0':
                    time = time[1:]
                if date.hour < 12:
                    time += u'am'
                else:
                    time += u'pm'
        date = date.strftime('%m/%d/%Y')
        return (date, time)

    (dep_date, dep_time) = to_travelocity_date(dep_date)
    (ret_date, ret_time) = to_travelocity_date(ret_date)

    params = {}
    params[u'leavingFrom'] = self.airports[dpt][3]
    params[u'goingTo'] = self.airports[to][3]
    params[u'leavingDate'] = dep_date
    params[u'dateLeavingTime'] = dep_time
    params[u'returningDate'] = ret_date
    params[u'dateReturningTime'] = ret_time
    etree = get_html_parse_tree(
        'http://travel.travelocity.com/flights/InitialSearch.do',
        data=urlencode(params), treetype='etree')

    # Follow the interstitial pages until the results load
    while True:
        script = [script for script in etree.getiterator(u'script')][1]
        matches = script.text and re.search(r'var finurl = "(.*)"',
                                            script.text)
        if matches:
            url = u'http://travel.travelocity.com/flights/%s' \
                  % matches.group(1)
            etree = get_html_parse_tree(url, treetype=u'etree')
        else:
            break

    # Handle error
    div = [d for d in etree.getiterator(u'div')
           if d.get(u'class') == u'e_content']
    if len(div):
        error = div[0].find(u'h3').text
        raise FlightException(error)

    departing_flights = self._parse_travelocity(etree)

    return_url = None
    table = [t for t in etree.getiterator(u'table')
             if t.get(u'id') == u'tfGrid'][0]
    for tr in table.getiterator(u'tr'):
        for td in tr.getiterator(u'td'):
            if td.get(u'class').strip() in [u'tfPrice', u'tfPriceOrButton']:
                onclick = td.find(u'div/button').get(u'onclick')
                match = re.search(r"location.href='\.\./flights/(.+)'",
                                  onclick)
                url_page = match.group(1)
                match = re.search(r'^(.*?)[^/]*$', url)
                url_base = match.group(1)
                return_url = url_base + url_page

    etree = get_html_parse_tree(return_url, treetype=u'etree')
    returning_flights = self._parse_travelocity(etree)

    return (departing_flights, returning_flights, url)
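# Expected behaviour of the to_travelocity_date() helper above, assuming
# parse() is dateutil.parser.parse (an assumption; the import is not shown
# in this section):
#
#   to_travelocity_date(u'25 dec 2009 evening')  -> (u'12/25/2009', u'Evening')
#   to_travelocity_date(u'25 dec 2009 14:00')    -> (u'12/25/2009', u'2:00pm')
#   to_travelocity_date(u'25 dec 2009')          -> (u'12/25/2009', u'Anytime')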