def get_room_info_from_web_site(self, flag):
    """ Get the room properties from the web site """
    try:
        # Announce which room is being fetched
        logger.info("-" * 70)
        logger.info("Room " + str(self.room_id) +
                    ": getting from Airbnb web site")
        target_url = self.config.URL_ROOM_ROOT + str(self.room_id)
        response = airbnb_ws.ws_request_with_repeats(self.config, target_url)
        # Guard clause: a missing response means the room page could not
        # be retrieved at all.
        if response is None:
            logger.info("Room %s: not found", self.room_id)
            return False
        # Parse the page and extract the room properties from the DOM tree
        document = html.fromstring(response.text)
        self.__get_room_info_from_tree(document, flag)
        logger.info("Room %s: found", self.room_id)
        return True
    except (KeyboardInterrupt, SystemExit):
        # never swallow a deliberate interrupt
        raise
    except Exception as ex:
        logger.exception("Room " + str(self.room_id) +
                         ": failed to retrieve from web site.")
        logger.error("Exception: " + str(type(ex)))
        raise
def get_search_page_info_zipcode(self, room_type, zipcode, guests,
                                 section_offset, flag):
    """
    Fetch one page of Airbnb search-API results for the given
    (room_type, zipcode, guests) combination and either save the
    listings (FLAGS_ADD) or print their ids (FLAGS_PRINT).

    Returns the number of rooms seen on the page (0 on a failed
    request). Raises on unexpected errors; a UnicodeEncodeError is
    logged and swallowed (returns None in that case, as before).
    """
    try:
        logger.info("-" * 70)
        logger.info(room_type + ", zipcode " + str(zipcode) + ", " +
                    str(guests) + " guests, " + "page " +
                    str(section_offset + 1))
        room_count = 0
        new_rooms = 0
        params = {}
        params["guests"] = str(guests)
        params["section_offset"] = str(section_offset)
        params["source"] = "filter"
        params["location"] = zipcode
        params["room_types[]"] = room_type
        response = airbnb_ws.ws_request_with_repeats(
            self.config, self.config.URL_API_SEARCH_ROOT, params)
        # Robustness fix: the original called response.json() without
        # checking whether the request failed (response is None).
        if response is None:
            logger.warning("No response for zipcode search: %s", params)
            return 0
        # Renamed from "json" to avoid shadowing the json module.
        json_doc = response.json()
        for result in json_doc["results_json"]["search_results"]:
            room_id = int(result["listing"]["id"])
            if room_id is not None:
                room_count += 1
                listing = self.listing_from_search_page_json(result, room_id)
                if listing is None:
                    continue
                if listing.host_id is not None:
                    listing.deleted = 0
                    if flag == self.config.FLAGS_ADD:
                        # save() returns truthy only for a new insert
                        if listing.save(self.config.FLAGS_INSERT_NO_REPLACE):
                            new_rooms += 1
                    elif flag == self.config.FLAGS_PRINT:
                        print(room_type, listing.room_id)
        has_rooms = 1 if room_count > 0 else 0
        if flag == self.config.FLAGS_ADD:
            self.log_progress(room_type, zipcode, guests, section_offset,
                              has_rooms)
        # Bug fix: "No rooms found" was previously logged whenever
        # flag != FLAGS_ADD, regardless of the actual result count.
        if room_count == 0:
            logger.info("No rooms found")
        return room_count
    except UnicodeEncodeError:
        logger.error(
            "UnicodeEncodeError: you may want to set PYTHONIOENCODING=utf-8"
        )
        # unhandled at the moment
    except Exception as e:
        logger.error("Exception type: " + type(e).__name__)
        raise
def __search_neighborhood_page(self, room_type, neighborhood, guests,
                               section_offset, flag):
    """
    Fetch one page of Airbnb search-API results restricted to a
    neighborhood of the survey's search area and either save the
    listings (FLAGS_ADD) or print their ids (FLAGS_PRINT).

    Returns the number of rooms seen on the page (0 on a failed
    request). A UnicodeEncodeError is logged and swallowed (returns
    None in that case, as before); other exceptions propagate.
    """
    try:
        logger.info("-" * 70)
        logger.info(room_type + ", " + str(neighborhood) + ", " +
                    str(guests) + " guests, " + "page " +
                    str(section_offset))
        new_rooms = 0
        room_count = 0
        params = {}
        params["page"] = str(section_offset)
        params["source"] = "filter"
        params["location"] = self.search_area_name
        params["room_types[]"] = room_type
        params["neighborhoods[]"] = neighborhood
        response = airbnb_ws.ws_request_with_repeats(
            self.config, self.config.URL_API_SEARCH_ROOT, params)
        # Robustness fix: the original called response.json() without
        # checking whether the request failed (response is None).
        if response is None:
            logger.warning("No response for neighborhood search: %s", params)
            return 0
        # Renamed from "json" to avoid shadowing the json module.
        json_doc = response.json()
        for result in json_doc["results_json"]["search_results"]:
            room_id = int(result["listing"]["id"])
            if room_id is not None:
                room_count += 1
                listing = self.listing_from_search_page_json(result, room_id)
                if listing is None:
                    continue
                if listing.host_id is not None:
                    listing.deleted = 0
                    if flag == self.config.FLAGS_ADD:
                        # save() returns truthy only for a new insert
                        if listing.save(self.config.FLAGS_INSERT_NO_REPLACE):
                            new_rooms += 1
                    elif flag == self.config.FLAGS_PRINT:
                        print(room_type, listing.room_id)
        has_rooms = 1 if room_count > 0 else 0
        if flag == self.config.FLAGS_ADD:
            neighborhood_id = self.get_neighborhood_id(neighborhood)
            self.log_progress(room_type, neighborhood_id, guests,
                              section_offset, has_rooms)
        return room_count
    except UnicodeEncodeError:
        logger.error("UnicodeEncodeError: set PYTHONIOENCODING=utf-8")
        # unhandled at the moment
        # (removed a trailing "except Exception: raise" handler, which
        # was a no-op: unhandled exceptions propagate anyway)
def get_room_info_from_web_site(self, flag):
    """ Get the room properties from the web site.

    Checks for a cached local copy of the room page first (when
    USE_CACHE is set); otherwise fetches it from Airbnb and, when
    caching is enabled, writes the fetched page into the cache
    directory. Returns True when the page was parsed, False otherwise.
    """
    try:
        # first check if we have a cached local version
        base_path = self.config.CACHE_PATH + "/" + str(
            self.config.SURVEY_ID)
        file = Path(base_path + "/" + str(self.room_id) + ".html")
        page = None
        if self.config.USE_CACHE and file.exists():
            page = file.read_text()
        else:
            # initialization
            logger.info("-" * 70)
            logger.info("Room " + str(self.room_id) +
                        ": getting from Airbnb web site")
            room_url = self.config.URL_ROOM_ROOT + str(self.room_id)
            response = airbnb_ws.ws_request_with_repeats(
                self.config, room_url)
            if response is not None:
                page = response.text
                if self.config.USE_CACHE:
                    # exist_ok avoids the non-atomic exists()/makedirs()
                    # pair of the original, which could race with another
                    # process creating the directory.
                    os.makedirs(base_path, exist_ok=True)
                    file.write_text(page)
            else:
                logger.info("Room %s: not found", self.room_id)
                return False
        if page:
            tree = html.fromstring(page)
            self.__get_room_info_from_tree(tree, flag)
            logger.info("Room %s: found", self.room_id)
            return True
        # Bug fix: the original fell off the end and returned an implicit
        # None when the fetched/cached page was empty; make the failure
        # explicit and consistent with the other return paths.
        logger.info("Room %s: empty page", self.room_id)
        return False
    except (KeyboardInterrupt, SystemExit):
        # never swallow a deliberate interrupt
        raise
    except Exception as ex:
        logger.exception("Room " + str(self.room_id) +
                         ": failed to retrieve from web site.")
        logger.error("Exception: " + str(type(ex)))
        raise
def search_node(self, room_type, guests, price_range, quadtree_node,
                median_node, flag):
    """
    rectangle is (n_lat, e_lng, s_lat, w_lng)
    returns number of *new* rooms and number of pages tested, plus the
    median (lat, lng) leaf for subdividing the quadtree node:
    (new_rooms, page_number, median_leaf).
    """
    try:
        logger.info("-" * 70)
        rectangle = self.get_rectangle_from_quadtree_node(
            quadtree_node, median_node)
        logger.info(
            ("Searching rectangle: {room_type}, guests = {guests}, "
             "prices in [{p1}, {p2}], zoom factor = {z}").format(
                 room_type=room_type, guests=guests,
                 p1=price_range[0], p2=price_range[1],
                 z=len(quadtree_node)))
        logger.debug("quadtree_node = {quadtree_node}".format(
            quadtree_node=str(quadtree_node)))
        logger.debug(
            "Rectangle: N={n:+.5f}, E={e:+.5f}, S={s:+.5f}, W={w:+.5f}".
            format(n=rectangle[0], e=rectangle[1],
                   s=rectangle[2], w=rectangle[3]))
        new_rooms = 0
        room_total = 0
        # median_lists are collected from results on each page and used to
        # calculate the median values, which will be used to divide the
        # volume into optimal "quadrants".
        median_lists = {}
        median_lists["latitude"] = []
        median_lists["longitude"] = []
        for page_number in range(1, self.config.SEARCH_MAX_PAGES + 1):
            room_count = 0
            # set up the parameters for the request
            params = {}
            params["guests"] = str(guests)
            params["page"] = str(page_number)
            params["source"] = "filter"
            params["room_types[]"] = room_type
            params["sw_lat"] = str(rectangle[2])
            params["sw_lng"] = str(rectangle[3])
            params["ne_lat"] = str(rectangle[0])
            params["ne_lng"] = str(rectangle[1])
            params["search_by_map"] = str(True)
            params["price_min"] = str(price_range[0])
            params["price_max"] = str(price_range[1])
            # make the http request
            response = airbnb_ws.ws_request_with_repeats(
                self.config, self.config.URL_API_SEARCH_ROOT, params)
            # process the response
            if response is None:
                logger.warning(
                    "No response received from request despite multiple attempts: {p}"
                    .format(p=params))
                continue
            # Renamed from "json" to avoid shadowing the json module.
            json_doc = response.json()
            for result in json_doc["results_json"]["search_results"]:
                room_id = int(result["listing"]["id"])
                if room_id is not None:
                    room_count += 1
                    room_total += 1
                    listing = self.listing_from_search_page_json(
                        result, room_id, room_type)
                    # Bug fix: the original appended listing.latitude and
                    # listing.longitude BEFORE the None check below, which
                    # raised AttributeError whenever no listing could be
                    # built from the result.
                    if listing is None:
                        continue
                    # Skip None coordinates: a None in the list would make
                    # the sorted() median calculation below raise.
                    if listing.latitude is not None:
                        median_lists["latitude"].append(listing.latitude)
                    if listing.longitude is not None:
                        median_lists["longitude"].append(listing.longitude)
                    if listing.host_id is not None:
                        listing.deleted = 0
                        if flag == self.config.FLAGS_ADD:
                            if listing.save(
                                    self.config.FLAGS_INSERT_NO_REPLACE):
                                new_rooms += 1
                        elif flag == self.config.FLAGS_PRINT:
                            print(room_type, listing.room_id)
            # Log page-level results
            logger.info(
                "Page {page_number:02d} returned {room_count:02d} listings"
                .format(page_number=page_number, room_count=room_count))
            if flag == self.config.FLAGS_PRINT:
                # for FLAGS_PRINT, fetch one page and print it
                sys.exit(0)
            if room_count < self.config.SEARCH_LISTINGS_ON_FULL_PAGE:
                # If a full page of listings is not returned by Airbnb,
                # this branch of the search is complete.
                logger.debug("Final page of listings for this search")
                break
        # Log rectangle-level results
        logger.info(
            ("Results: {page_count} pages, {new_rooms} new rooms, "
             "{room_type}, {g} guests, prices in [{p1}, {p2}]").format(
                 room_type=room_type, g=str(guests),
                 p1=str(price_range[0]), p2=str(price_range[1]),
                 new_rooms=str(new_rooms), page_count=str(page_number)))
        if len(median_node) == 0:
            median_leaf = "[]"
        else:
            median_leaf = median_node[-1]
        logger.info(
            "Results: rect = {median_leaf}, node = {quadtree_node}".format(
                quadtree_node=str(quadtree_node),
                median_leaf=str(median_leaf)))
        # calculate medians
        # Bug fix: the original guarded on room_count, which only reflects
        # the LAST page fetched; an empty final page discarded all the
        # coordinates collected on earlier pages. Guard on the collected
        # samples instead.
        if median_lists["latitude"] and median_lists["longitude"]:
            median_lat = sorted(median_lists["latitude"])[int(
                len(median_lists["latitude"]) / 2)]
            median_lng = sorted(median_lists["longitude"])[int(
                len(median_lists["longitude"]) / 2)]
            median_leaf = [median_lat, median_lng]
        else:
            # values not needed, but we need to fill in an item anyway
            median_leaf = [0, 0]
        # log progress
        self.log_progress(room_type, guests, price_range[0],
                          price_range[1], quadtree_node, median_node)
        return (new_rooms, page_number, median_leaf)
    except UnicodeEncodeError:
        # Repaired a log message string that was broken across lines in
        # the original source.
        logger.error("UnicodeEncodeError: set PYTHONIOENCODING=utf-8")
        # unhandled at the moment
    except Exception:
        logger.exception("Exception in get_search_page_info_rectangle")
        raise
def ws_get_city_info(config, city, flag):
    """Scrape the Airbnb search page for *city* to discover its canonical
    name and its neighborhoods, then either print them (FLAGS_PRINT) or
    insert search_area, city, and neighborhood rows into the database
    (FLAGS_ADD).

    Returns False when the search page could not be fetched; otherwise
    returns None (including the early return when the search area
    already exists). Raises on database or parsing errors.
    """
    try:
        url = config.URL_SEARCH_ROOT + city
        response = airbnb_ws.ws_request_with_repeats(config, url)
        if response is None:
            return False
        tree = html.fromstring(response.text)
        try:
            # The city name and neighborhood list are read from hidden
            # <input> elements on the search page.
            citylist = tree.xpath("//input[@name='location']/@value")
            neighborhoods = tree.xpath(
                "//input[contains(@id, 'filter-option-neighborhoods')]/@value")
            if flag == config.FLAGS_PRINT:
                print("\n", citylist[0])
                print("Neighborhoods:")
                for neighborhood in neighborhoods:
                    print("\t", neighborhood)
            elif flag == config.FLAGS_ADD:
                if len(citylist) > 0:
                    conn = config.connect()
                    cur = conn.cursor()
                    # check if it exists
                    sql_check = """
                        select name
                        from search_area
                        where name = %s"""
                    cur.execute(sql_check, (citylist[0], ))
                    if cur.fetchone() is not None:
                        # Early return: nothing inserted, nothing committed.
                        logger.info("City already exists: " + citylist[0])
                        return
                    sql_search_area = """insert
                        into search_area (name)
                        values (%s)"""
                    cur.execute(sql_search_area, (citylist[0], ))
                    # city_id = cur.lastrowid
                    # currval() suggests a PostgreSQL sequence; fetch the
                    # id of the search_area row just inserted.
                    sql_identity = """select
                        currval('search_area_search_area_id_seq')
                        """
                    cur.execute(sql_identity, ())
                    search_area_id = cur.fetchone()[0]
                    sql_city = """insert
                        into city (name, search_area_id)
                        values (%s,%s)"""
                    cur.execute(sql_city, (
                        city,
                        search_area_id,
                    ))
                    logger.info("Added city " + city)
                    logger.debug(str(len(neighborhoods)) + " neighborhoods")
                    if len(neighborhoods) > 0:
                        sql_neighborhood = """
                            insert into neighborhood(name, search_area_id)
                            values(%s, %s)
                            """
                        for neighborhood in neighborhoods:
                            cur.execute(sql_neighborhood, (
                                neighborhood,
                                search_area_id,
                            ))
                            logger.info("Added neighborhood " + neighborhood)
                    else:
                        logger.info("No neighborhoods found for " + city)
                    # Single commit covers the search_area, city, and
                    # neighborhood inserts.
                    conn.commit()
        except UnicodeEncodeError:
            # if sys.version_info >= (3,):
            #     logger.info(s.encode('utf8').decode(sys.stdout.encoding))
            # else:
            #     logger.info(s.encode('utf8'))
            # unhandled at the moment
            pass
        except Exception:
            logger.error("Error collecting city and neighborhood information")
            raise
    except Exception:
        logger.error("Error getting city info from website")
        raise
def search_node(self, quadtree_node, median_node, flag):
    """
    rectangle is (n_lat, e_lng, s_lat, w_lng)
    Searches up to SEARCH_MAX_PAGES pages of results for the rectangle
    identified by quadtree_node, saving or printing the listings found.
    Returns (zoomable, median_leaf): zoomable is False when the search
    finished without a full final page (so subdividing this node is
    pointless); median_leaf is the [median_lat, median_lng] of the
    listings seen (or [0, 0] when none were collected). Returns None
    when the results-bearing script tag is missing from the page.
    """
    try:
        logger.info("-" * 70)
        rectangle = self.get_rectangle_from_quadtree_node(
            quadtree_node, median_node)
        logger.info(
            "Searching rectangle: zoom factor = {z}, node = {node}".format(
                z=len(quadtree_node), node=str(quadtree_node)))
        logger.debug(
            "Rectangle: N={n:+.5f}, E={e:+.5f}, S={s:+.5f}, W={w:+.5f}".
            format(n=rectangle[0], e=rectangle[1],
                   s=rectangle[2], w=rectangle[3]))
        new_rooms = 0
        room_quadtree_total = 0
        # set zoomable to false if the search finishes without returning a
        # full complement of 20 pages, 18 listings per page
        zoomable = True
        # median_lists are collected from results on each page and used to
        # calculate the median values, which will be used to divide the
        # volume into optimal "quadrants".
        median_lists = {}
        median_lists["latitude"] = []
        median_lists["longitude"] = []

        # Stolen from StackOverflow: search for items with a given key
        # https://stackoverflow.com/questions/14048948/how-to-find-a-particular-json-value-by-key
        # Hoisted out of the page loop (the original redefined it on
        # every iteration).
        def search_json_keys(key, json_doc):
            """ Return a list of the values for each occurrence of key
            in json_doc, at all levels. In particular, "listings"
            occurs more than once, and we need to get them all."""
            found = []
            if isinstance(json_doc, dict):
                if key in json_doc.keys():
                    found.append(json_doc[key])
                elif len(json_doc.keys()) > 0:
                    for json_key in json_doc.keys():
                        result_list = search_json_keys(
                            key, json_doc[json_key])
                        if result_list:
                            found.extend(result_list)
            elif isinstance(json_doc, list):
                for item in json_doc:
                    result_list = search_json_keys(key, item)
                    if result_list:
                        found.extend(result_list)
            return found

        for section_offset in range(0, self.config.SEARCH_MAX_PAGES):
            # section_offset is the zero-based counter used on the site
            # page number is convenient for logging, etc
            page_number = section_offset + 1
            room_count = 0
            # set up the parameters for the request
            params = {}
            params["source"] = "filter"
            params["refinement_paths[]"] = "homes"
            params["sw_lat"] = str(rectangle[2])
            params["sw_lng"] = str(rectangle[3])
            params["ne_lat"] = str(rectangle[0])
            params["ne_lng"] = str(rectangle[1])
            params["search_by_map"] = str(True)
            if section_offset > 0:
                params["section_offset"] = str(section_offset)
            # make the http request
            response = airbnb_ws.ws_request_with_repeats(
                self.config, self.config.URL_API_SEARCH_ROOT, params)
            # If no response, maybe it's a network problem rather than a
            # lack of data, so to be conservative go to the next page
            # rather than the next rectangle
            if response is None:
                logger.warning(
                    "No response received from request despite multiple attempts: {p}"
                    .format(p=params))
                continue
            soup = BeautifulSoup(
                response.content.decode("utf-8", "ignore"), "lxml")
            # NOTE(review): looks like a leftover debug dump, overwritten
            # on every page — confirm whether it is still needed.
            # Preserved for behavior, but now closed via a context manager.
            with open("test.html", mode="w", encoding="utf-8") as html_file:
                html_file.write(soup.prettify())
            # The returned page includes a script tag that encloses a
            # comment. The comment in turn includes a complex json
            # structure as a string, which has the data we need
            spaspabundlejs_set = soup.find_all(
                "script", {
                    "type": "application/json",
                    "data-hypernova-key": "spaspabundlejs"
                })
            if len(spaspabundlejs_set) > 0:
                logger.debug("Found spaspabundlejs tag")
                comment = spaspabundlejs_set[0].contents[0]
                # strip out the comment tags (everything outside the
                # outermost curly braces)
                json_doc = json.loads(
                    comment[comment.find("{"):comment.rfind("}") + 1])
                logger.debug("results-containing json found")
            else:
                logger.warning(
                    "json results-containing script node "
                    "(spaspabundlejs) not found in the web page: "
                    "go to next page")
                # NOTE(review): despite the message, this aborts the whole
                # node rather than going to the next page — preserved
                # as-is; confirm intent before changing.
                return None
            # Now we have the json. It includes a list of 18 or fewer
            # listings. Debug dump, also overwritten per page.
            with open("listing_json.json", mode="w",
                      encoding="utf-8") as json_file:
                json_file.write(
                    json.dumps(json_doc, indent=4, sort_keys=True))
            # Get all items with tags "listings". Each json_listings is a
            # list, and each json_listing is a
            # {listing, pricing_quote, verified} dict for the listing in
            # question. There may be multiple lists of listings.
            json_listings_lists = search_json_keys("listings", json_doc)
            room_count = 0
            for json_listings in json_listings_lists:
                for json_listing in json_listings:
                    room_id = int(json_listing["listing"]["id"])
                    if room_id is not None:
                        room_count += 1
                        room_quadtree_total += 1
                        listing = self.listing_from_search_page_json(
                            json_listing, room_id)
                        if listing is None:
                            continue
                        if listing.latitude is not None:
                            median_lists["latitude"].append(
                                listing.latitude)
                        if listing.longitude is not None:
                            median_lists["longitude"].append(
                                listing.longitude)
                        if listing.host_id is not None:
                            listing.deleted = 0
                            if flag == self.config.FLAGS_ADD:
                                if listing.save(self.config.
                                                FLAGS_INSERT_NO_REPLACE):
                                    new_rooms += 1
                            elif flag == self.config.FLAGS_PRINT:
                                # Bug fix: the original printed the
                                # undefined name room_type (not a
                                # parameter of this method), raising
                                # NameError on the FLAGS_PRINT path.
                                print(listing.room_id)
            # Log page-level results
            logger.info(
                "Page {page_number:02d} returned {room_count:02d} listings"
                .format(page_number=page_number, room_count=room_count))
            if flag == self.config.FLAGS_PRINT:
                # for FLAGS_PRINT, fetch one page and print it
                sys.exit(0)
            if room_count < self.config.SEARCH_LISTINGS_ON_FULL_PAGE:
                # If a full page of listings is not returned by Airbnb,
                # this branch of the search is complete.
                logger.debug("Final page of listings for this search")
                zoomable = False
                break
        # Log node-level results
        logger.info(
            "Results: {page_count} pages, {new_rooms} new rooms".format(
                new_rooms=str(new_rooms), page_count=str(page_number)))
        # Median-based partitioning not currently in use: may use later
        if len(median_node) == 0:
            median_leaf = "[]"
        else:
            median_leaf = median_node[-1]
        # calculate medians
        # Bug fix: the original guarded on room_count, which only reflects
        # the LAST page fetched; an empty final page discarded the
        # coordinates collected on earlier pages. Guard on the collected
        # samples instead.
        if median_lists["latitude"] and median_lists["longitude"]:
            median_lat = round(
                sorted(median_lists["latitude"])[int(
                    len(median_lists["latitude"]) / 2)], 5)
            median_lng = round(
                sorted(median_lists["longitude"])[int(
                    len(median_lists["longitude"]) / 2)], 5)
            median_leaf = [median_lat, median_lng]
        else:
            # values not needed, but we need to fill in an item anyway
            median_leaf = [0, 0]
        # log progress
        self.log_progress(quadtree_node, median_node)
        return (zoomable, median_leaf)
    except UnicodeEncodeError:
        logger.error("UnicodeEncodeError: set PYTHONIOENCODING=utf-8")
        # unhandled at the moment
    except Exception:
        logger.exception("Exception in get_search_page_info_rectangle")
        raise