def run(self):
    """Scrape restaurant listings for the configured cities and persist them.

    City selection (in priority order):
      1. ``self.top_how_much`` set  -> top N rows of the ``top_cities`` table;
      2. ``self.city_name`` unset   -> every ``TopCities`` entity;
      3. otherwise                  -> only the ``TopCities`` entity matching
                                       ``self.city_name``.

    For each city a ``SpeisekarteSpider`` is run; on success (and outside
    test mode) every restaurant is saved to the datastore under the key
    ``<city>$<restaurant_id>``. The SQL session is always closed.
    """
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    db.create_session()
    if self.top_how_much is not None:
        df = db.fetch_table_as_dataframe('top_cities')
        # Use a list, not a set, for the column selection: a set makes the
        # column order non-deterministic, while the loop below relies
        # positionally on city[0] == state and city[1] == city (the same
        # [state, city] layout the other two branches produce).
        cities_dataframe = pd.DataFrame(data=df.iloc[:self.top_how_much],
                                        columns=['state', 'city'])
        cities = cities_dataframe.values.tolist()
        # Log through the class logger instead of a stray debug print().
        self.logger.debug('Cities selected for scraping: %s', cities)
    elif self.city_name is None:
        city_objects = db.fetch_entity_where('TopCities')
        cities = [[city.state, city.city] for city in city_objects]
    else:
        city_objects = db.fetch_entity_where('TopCities', True, False,
                                             city=self.city_name)
        cities = [[city.state, city.city] for city in city_objects]

    all_results = []
    try:
        for state, city_name in cities:
            self.current_city = city_name
            self.current_state = state
            result = {
                'city': self.current_city,
                'total': None,
                'restaurants': []
            }
            self.logger.info('Starting to scrape {0}'.format(
                self.current_city))

            spider = SpeisekarteSpider(self.current_city)
            spider.run()
            spider_result = spider.get_result()
            success = spider_result.get_success()

            if success and not self.test_mode:
                data = spider_result.get_data()
                restaurants = data['restaurants']
                total = data['total']
                result['total'] = total
                for restaurant in restaurants:
                    restaurant_id = restaurant['id']
                    # Datastore key combines city and restaurant id so the
                    # same restaurant id in two cities cannot collide.
                    entity_id = self.current_city + '$' + restaurant_id
                    datastore_entity = self._create_datastore_entity(
                        restaurant)
                    success = self._save(entity_id, datastore_entity)
                    restaurant_result = {
                        'success': success,
                        'content': restaurant_id
                    }
                    result['restaurants'].append(restaurant_result)
            all_results.append(result)
    except HTTPError as error:
        self.logger.exception(
            'Encountered HTTP error %s on %s:\nAbort program.',
            error.code, error.url)
    except Exception:
        # Narrowed from a bare `except:` which would also swallow
        # SystemExit/KeyboardInterrupt; still logged as a last resort.
        self.logger.exception('Something went wrong')
    finally:
        db.close_session()
def _fetch_top_city_from(self, top_how_much, table_name):
    """Return the names of the top ``top_how_much`` cities from ``table_name``.

    Args:
        top_how_much: Number of leading rows to take from the table.
        table_name: SQL table to read (e.g. ``'top_cities'``).

    Returns:
        list[str]: City names, in table order.
    """
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    db.create_session()
    try:
        df = db.fetch_table_as_dataframe(table_name)
        self.logger.info('Fetching Top {0}'.format(top_how_much))
        # Select the column with a list, not a set literal, so the
        # projection is deterministic and consistent with the rest of
        # the file.
        cities_dataframe = pd.DataFrame(data=df.iloc[:top_how_much],
                                        columns=['city'])
        return cities_dataframe['city'].values.tolist()
    finally:
        # The original leaked the session; mirror run(), which closes
        # its session in a finally block.
        db.close_session()
def run(self):
    """Search Immoscout24 for restaurant properties in the top cities.

    Reads the top ``self.top_how_much`` cities from the ``top_cities``
    table, resolves each to an Immoscout24 geo id, queries the search API
    for gastronomy/restaurant objects matching the configured seat count
    and derived floor-space range, flattens every hit via
    ``self.transform_df`` into one DataFrame, and persists the whole
    result set as JSON under ``self.entity_id``.

    Returns:
        Result: Carries the success flag of the datastore save.
    """
    result = Result()
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    df = db.fetch_table_as_dataframe('top_cities')
    # List (not set) column selection keeps the projection deterministic.
    cities = pd.DataFrame(data=df.iloc[0:self.top_how_much],
                          columns=['city'])
    for index, row in cities.iterrows():
        self.logger.debug(str(index + 1) + ". " + row['city'])

    city_for_search = pd.DataFrame(cities, columns=['city'])

    immo_oauth = OAuth1(
        constants.IMMOSCOUT_CLIENT_KEY,
        client_secret=constants.IMMOSCOUT_CLIENT_SECRET,
        resource_owner_key=constants.IMMOSCOUT_RESOURCE_OWNER_KEY,
        resource_owner_secret=constants.IMMOSCOUT_RESOURCE_OWNER_SECRET)

    # Empty frame with a deterministic column order (set literals have none).
    geo_df = pd.DataFrame(columns=['geoId', 'city'])
    # Resolve each city to an Immoscout24 geo id via the API.
    geo_df = self.get_geo_id(city_for_search, geo_df, immo_oauth)

    # Restaurant floor-space breakdown (guest area is ~40% of the total):
    # https://se909eeccf1caa559.jimcontent.com/download/version/1507517357/module/11096440527/name/AuszugDiplomarbeit_13.03.2006.pdf
    # Gast = 40 %, Technik = 12 %, Personal = 8 %
    total_floor_space_min = (constants.FLOOR_SPACE_GUEST
                             * constants.SEATS_MIN / 40 * 100.0)
    total_floor_space_max = (constants.FLOOR_SPACE_GUEST
                             * constants.SEATS_MAX / 40 * 100.0)

    # Collect per-hit frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and appending in a loop
    # re-copies the frame on every iteration (quadratic).
    frames = []
    # Query the Immoscout24 search endpoint once per resolved geo id.
    for index, row in geo_df.iterrows():
        params = {
            'realestatetype': 'gastronomy',
            'geocodes': str(row['geoId']),
            'gastronomytypes': 'restaurant',
            'channel': 'is24',
            'numberofseats':
                str(constants.SEATS_MIN) + '-' + str(constants.SEATS_MAX),
            'pagesize': '200',
            'totalfloorspace':
                str(total_floor_space_min) + '-'
                + str(total_floor_space_max)
        }
        immo_search_response = requests.request(
            method='GET',
            url=constants.IMMOSCOUT_SEARCH_URL,
            params=params,
            headers=constants.IMMOSCOUT_HEADERS,
            auth=immo_oauth)
        immo_search_json = pd.read_json(immo_search_response.text)
        hits = immo_search_json['resultlist.resultlist'][0]['numberOfHits']
        self.logger.info("Hits: " + str(hits) + " for city: "
                         + str(row['city']) + "\r\n")
        if hits == 1:
            # With a single hit the API returns resultlistEntry as a
            # plain object rather than a list — no index needed.
            entry = (immo_search_json['resultlist.resultlist'][1][0]
                     ['resultlistEntry'])
            immo_object = entry['resultlist.realEstate']
            real_estate_id = immo_object['@id']
            frames.append(self.transform_df(immo_object, real_estate_id))
        elif hits > 1:
            # hits == 1 is handled above, so this branch is only reached
            # for 2+ hits; the original `>= 1` was misleading.
            entries = (immo_search_json['resultlist.resultlist'][1][0]
                       ['resultlistEntry'])
            for i in range(hits):
                immo_object = entries[i]['resultlist.realEstate']
                real_estate_id = immo_object['@id']
                frames.append(
                    self.transform_df(immo_object, real_estate_id))
        else:
            self.logger.info('No object found for city: '
                             + str(row['city']))

    # ignore_index/sort mirror the semantics of the original
    # append(..., ignore_index=True, sort=True) calls.
    restaurant_df = (pd.concat(frames, ignore_index=True, sort=True)
                     if frames else pd.DataFrame())
    self.logger.info(restaurant_df)

    result_json = restaurant_df.to_json(orient='records')
    attributes = self._create_datastore_entity(result_json)
    success = self._save(self.entity_id, attributes)
    result.set_success(success)
    self.logger.info(result)
    return result