import json
import os

import util


def crawler(u, n):
    """Download a Douban user's status timeline: one JSON file plus
    image attachments per status (Python 2)."""
    url = 'https://m.douban.com/rexxar/api/v2/status/user_timeline/'
    path = n + os.sep + 'status'
    if not os.path.isdir(path):
        os.makedirs(path)
    url += u
    query = ''
    while True:
        print(url + query)
        status = json.loads(util.open_url(url + query))
        if not status['items']:
            break
        for item in status['items']:
            item = item['status']
            jpath = os.path.join(path, item['id'] + '.json')
            if os.path.isfile(jpath):
                # Already downloaded; the timeline is newest-first, so stop
                # here (use `continue` instead to re-scan the whole page).
                break
            with open(jpath, 'wb') as f:
                # Python 2: json.dumps returns a byte string, so 'wb' is fine.
                f.write(json.dumps(item))
            # Save every attached image next to the status JSON. The suffix
            # keeps the full basename of the source URL, not just the
            # extension, matching the original behaviour.
            for idx, image in enumerate(item['images']):
                img_url = image['large']['url']
                suffix = img_url[img_url.rfind('/') + 1:]
                img_path = os.path.join(
                    path, item['id'] + '_' + str(idx) + '.' + suffix)
                with open(img_path, 'wb') as f:
                    f.write(util.open_url(img_url))
        # Page through the timeline using the id of the last status seen.
        query = '?max_id=' + status['items'][-1]['status']['id']
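# A minimal usage sketch for crawler(). Both values below are hypothetical
# placeholders, not taken from the original source: `u` is the numeric
# Douban user id appended to the API URL, and `n` is the output directory
# under which a 'status' folder is created.
crawler('123456789', 'douban_backup')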
def get_game_datetime(self):
    url = self.url_provider.get_page_url('overview')
    res = util.open_url(self.browser, url)
    soup = BeautifulSoup(res.read(), "lxml")
    datetime_data = soup.find("li", {"class": "OGameClock"}).text
    game_datetime = datetime.datetime.strptime(datetime_data,
                                               "%d.%m.%Y %H:%M:%S")
    return game_datetime
def get_resources(self, planet):
    self.logger.info('Getting resources data for planet %s' % planet.name)
    url = self.url_provider.get_page_url('resources', planet)
    res = util.open_url(self.browser, url)
    # Pass the parser explicitly, as get_game_datetime() does.
    soup = BeautifulSoup(res.read(), "lxml")
    # Amounts are rendered with '.' as thousands separator, e.g. '1.234.567'.
    metal = int(soup.find(id='resources_metal').text.replace('.', ''))
    crystal = int(soup.find(id='resources_crystal').text.replace('.', ''))
    deuterium = int(soup.find(id='resources_deuterium').text.replace('.', ''))
    energy = int(soup.find(id='resources_energy').text.replace('.', ''))
    return Resources(metal, crystal, deuterium, energy)
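# The Resources class is referenced above but not shown. A minimal sketch
# of a container matching the constructor call; the field names are
# assumptions, not the original definition.
class Resources(object):
    def __init__(self, metal, crystal, deuterium, energy):
        self.metal = metal
        self.crystal = crystal
        self.deuterium = deuterium
        self.energy = energy

    def __repr__(self):
        return 'Resources(metal=%s, crystal=%s, deuterium=%s, energy=%s)' % (
            self.metal, self.crystal, self.deuterium, self.energy)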
def get_planets(self):
    self.logger.info('Getting planets')
    url = self.url_provider.get_page_url('resources')
    res = util.open_url(self.browser, url)
    soup = BeautifulSoup(res.read(), "lxml")
    planets = []

    # The currently selected planet is described by meta tags.
    current_planet_id = soup.find(
        "meta", {"name": "ogame-planet-id"})['content']
    current_planet_name = soup.find(
        "meta", {"name": "ogame-planet-name"})['content']
    current_planet_koords = soup.find(
        "meta", {"name": "ogame-planet-coordinates"})['content']
    planets.append(Planet(current_planet_name, current_planet_id,
                          current_planet_koords))

    # The remaining planets are sidebar links; the planet id travels in the
    # 'cp' query parameter, so parse the query component of each href
    # (the original passed the whole href to parse_qs).
    links = soup.findAll(
        "a", {"class": "planetlink tooltipRight js_hideTipOnMobile"})
    other_planets = [
        Planet(str(link.find("span", {"class": "planet-name "}).contents[0]),
               urlparse.parse_qs(urlparse.urlparse(link['href']).query)['cp'][0],
               self.parse_coordinates(str(
                   link.find("span", {"class": "planet-koords "}).contents[0])))
        for link in links]
    # Extend with every additional planet found; the original `> 1` check
    # would have dropped the case of exactly one extra planet.
    if other_planets:
        planets.extend(other_planets)
    return planets
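# self.parse_coordinates() is called above but not shown. A plausible
# sketch, assuming the sidebar renders coordinates as '[1:234:5]' (both
# the bracket format and the stripping behaviour are assumptions):
def parse_coordinates(self, text):
    # '[1:234:5]' -> '1:234:5', matching the bare string that the
    # ogame-planet-coordinates meta tag provides for the current planet.
    return text.strip().lstrip('[').rstrip(']')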
def log_index_page(self):
    """Logs the index page, used for test purposes"""
    url = self.url_provider.get_page_url('overview')
    res = util.open_url(self.browser, url)
    self.logger.info(res.read())
import gc
import json
import sys
from time import time

# `log`, `util`, and the `_re_json_line` regex are module-level globals
# defined elsewhere in the original source.


def import_json_from_url(url, description, batch_size, key=None):
    try:
        if batch_size is not None:
            log.info("Batch downloading {0} list from {1} ... ".format(
                description, url))
            sys.stdout.flush()

            start = int(time())
            done = 0
            failed = 0
            last_elapsed = 0
            batch = []

            stream = util.open_url(url)
            if stream is None:
                return
            while True:
                line = util.read_stream_line(stream)
                if not line:
                    break
                m = _re_json_line.match(line)
                if m is None:
                    continue
                try:
                    obj = json.loads(m.group(1))
                except ValueError:
                    log.debug("Line failed JSON parse: {0}".format(line))
                    failed += 1
                    continue
                batch.append(obj)
                # Flush a full batch to the caller and log progress at most
                # once every 30 seconds.
                if len(batch) >= batch_size:
                    for obj in batch:
                        yield obj
                    done += len(batch)
                    elapsed = int(time()) - start
                    if elapsed - last_elapsed >= 30:
                        log.info("Loaded {0} row(s) of {1} data to DB...".format(
                            done, description))
                        last_elapsed = elapsed
                    batch = []
            # Flush whatever is left in the final, partial batch. The
            # original had this block duplicated; one flush is enough.
            for obj in batch:
                yield obj
            done += len(batch)
            if failed:
                log.info("Lines failing JSON parse: {0}".format(failed))
            log.info("Loaded {0} row(s) of {1} data to DB...".format(
                done, description))
            log.info("Done.")
        else:
            log.info("Downloading {0} list from {1} ... ".format(
                description, url))
            sys.stdout.flush()
            encoded = util.read_from_url(url)
            log.info("Done.")
            log.info("Loading {0} data...".format(description))
            sys.stdout.flush()
            obj = json.loads(encoded)
            log.info("Done.")
            log.info("Adding {0} data to DB...".format(description))
            sys.stdout.flush()
            if key is not None:
                obj = obj[key]
            for o in obj:
                yield o
            log.info("Done.")
        # Force GC collection to try to avoid memory errors
        encoded = None
        obj = None
        batch = None
        gc.collect()
    except MemoryError:
        encoded = None
        obj = None
        batch = None
        gc.collect()
        raise
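# Every snippet above leans on a shared `util` module that is not included
# here. The snippets come from different codebases, so util.open_url appears
# with two signatures (url-only vs. browser + url), and even the url-only
# form is used both for a body string (crawler) and for a stream
# (import_json_from_url). The stand-ins below are therefore only a sketch of
# plausible implementations, not the original helpers.
import urllib2  # Python 2 stdlib; use urllib.request on Python 3


def open_url(url):
    # url-only form: return the raw response body, as crawler() expects.
    # import_json_from_url() instead treats the result as a stream, so the
    # real helper in that codebase likely returns the response object itself.
    return urllib2.urlopen(url).read()


def open_url_with_browser(browser, url):
    # Stand-in for the browser + url form used by the OGame methods:
    # navigate an existing session (e.g. a mechanize.Browser) and return
    # the response so the caller can .read() it.
    return browser.open(url)


def read_stream_line(stream):
    # Assumed to return the next line of a streaming response, or a falsy
    # value once the stream is exhausted.
    return stream.readline()


def read_from_url(url):
    # Assumed to return the entire response body in one call.
    return urllib2.urlopen(url).read()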