def get_url(url, time=20):
    '''Fetch a URL and return its raw contents, or None on an HTTP error.'''
    try:
        output = wget(url, timeout=time).read()
    except urllib2.HTTPError as e:
        error_message = e.code
        print(error_message)
        return None
    return output
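# A minimal sketch of the module-level setup these functions appear to rely on.
# The alias names (wget, bs) and the base URL are inferred from how they are used
# in this file, not taken from the original source, so treat them as assumptions.
import re
import sys
import urllib2

from bs4 import BeautifulSoup as bs

wget = urllib2.urlopen                      # "wget" is used like urllib2.urlopen throughout
URL = "https://bulbapedia.bulbagarden.net"  # assumed base for the /wiki/... paths below

# Example use of get_url (a live network call, so shown commented out):
#   html = get_url("https://example.com", time=10)
#   if html is not None:
#       print(len(html))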
def populateList():
    '''first, we get the whole list of pokemon, sorted by national dex number.
    there is also a regional dex number, which i will preserve later.
    returns a list of tuples in the form (name, url_suffix).
    '''
    path = URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
    page = wget(path)
    soup = bs(page.read(), 'html.parser')
    tables = soup.findAll('table')
    # - tables[1] is the list of kanto (kdex) pokemon.
    # - tables[2] is the list of johto (jdex) pokemon.
    # - tables[3] is the list of hoenn (hdex) pokemon.
    # - tables[4] is the list of sinnoh (sdex) pokemon.
    # - tables[5] is the list of unova (udex) pokemon.
    # - tables[6] is the list of kalos pokemon. kalos is special because the region is
    #   split into 3 sub-regions: central (cekdex), coastal (cokdex), and mountain (mokdex).
    # - tables[7] is the list of alola (adex) pokemon. it is not populated, as the region
    #   is part of the gen VII games, which have not been released yet.

    # get a list of pokemon
    pokemon = []
    for table in tables[:7]:  # ignoring the alola region for now
        entries = bs(str(table), 'html.parser').findAll('tr')
        for entry in entries[1:]:  # entries[0] defines the column headers.
            entry = bs(str(entry), 'html.parser')
            info = entry.findAll('td')[3]
            poke = (info.a.contents[0], info.a['href'])
            if poke not in pokemon:  # there are duplicate entries; some pokemon have different "states".
                pokemon.append(poke)
    # a dictionary would reorder the entries; a list keeps national dex order for debugging's sake.
    return pokemon
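# Usage sketch for populateList (a live network call, so shown commented out).
# The exact first entry is illustrative; the shape follows the docstring above.
#   pokemon = populateList()
#   print(len(pokemon))   # number of unique entries through gen VI
#   print(pokemon[0])     # e.g. something like (u'Bulbasaur', u'/wiki/Bulbasaur_(Pok%C3%A9mon)')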
def download_http_photo(self, url, user_profile):
    if url is not None:
        try:
            response = wget(url)
            # first param for extension
            user_profile.photo.save(url, ContentFile(response.read()))
        except Exception as e:
            logger.error(
                "Unable to download photo from url %s for user %s because %s",
                url, user_profile.username, e)
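# The method above reads like part of a Django class; a hedged sketch of the
# imports and logger it would need (assumed, not taken from the original file):
#   import logging
#   from django.core.files.base import ContentFile
#   logger = logging.getLogger(__name__)
# Passing `url` as the first argument to photo.save() means the stored file name
# (and hence its extension) comes from the URL, which is what the inline
# "first param for extension" comment refers to.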
def cullPokemonData(pokeTuple):
    '''Grabs data for a single pokemon and inserts it into the database.'''
    path = URL + pokeTuple[1]
    page = wget(path)
    sys.stdout.write(".")
    soup = bs(page.read(), 'html.parser')
    table = soup.find('table', {'class': 'roundy'})
    # at this point, I have the right table. need to parse out the following values.

    # name
    element = table.find("td", {"width": "50%"})
    name = element.big.big.b.contents[0]
    if "Nidoran" in name:
        name = name[:-1]  # strip the trailing gender symbol
    # print "name >>>", name # debug

    # category
    # to account for inline explain spans
    if len(element.a.span.contents) > 1:
        category = element.a.span.span.contents[0] + element.a.span.contents[1]
    else:
        category = element.a.span.contents[0]
    category = re.sub("\xe9", "e", category)
    # print "cat >>>", category # debug
    sys.stdout.write(".")

    # national dex number
    element = table.find("th", {"width": "25%", "class": "roundy", "style": "background:#FFF;"})
    natdex = element.big.big.a.span.contents[0]
    # print "natdex >>>", natdex # debug

    # type(s); fall back to the alternate table layout when the first cell has no bold span
    _type = ""
    element = table.find("td", {"class": "roundy", "colspan": "4"})
    types = element.findAll("td")
    if types[0].a.span.b is None:
        element = table.find("td", {"class": "roundy", "colspan": "2"})
        element = element.table.tr.td.table.tr
        types = element.findAll("td")
    for t in types:
        if t.a.span.b.contents[0] != "Unknown":
            _type += t.a.span.b.contents[0] + " "
    # print "type >>>", _type # debug
    sys.stdout.write(".")

    # note: values are interpolated directly into the SQL; a parameterized query would be safer.
    script = 'INSERT INTO pokemon(name, category, natdex, type) VALUES ("%s", "%s", "%s", "%s")' % (name, category, natdex, _type)
    try:
        cursor.execute(script)
        out = cursor.fetchone()
        if out:
            print(out)
        db.commit()
    except Exception:
        db.rollback()
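# A hedged sketch of the database plumbing cullPokemonData assumes. The driver,
# connection parameters, and table definition are guesses (the original only
# shows `cursor` and `db`), so this is illustrative rather than definitive:
#   import MySQLdb
#   db = MySQLdb.connect("localhost", "user", "password", "pokedex")
#   cursor = db.cursor()
#   cursor.execute("CREATE TABLE IF NOT EXISTS pokemon ("
#                  "name VARCHAR(32), category VARCHAR(64), "
#                  "natdex VARCHAR(8), type VARCHAR(32))")
#
#   for poke in populateList():
#       cullPokemonData(poke)
#   db.close()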