def fetch_reviews_from_postgres(with_categories):
    """Load all reviews (joined with zip code and city) into a DataFrame.

    When ``with_categories`` is true, each review row additionally carries
    the restaurant's food category (one row per review/category pair).
    """
    global language
    helper = SqlHelper(constants.SQL_DATABASE_NAME)
    connection = helper.get_connection()
    # assemble the query from shared pieces so both variants stay in sync
    select_clause = 'SELECT r.rating, r.text, r.language, zip.zip_code, city.name as city'
    join_clause = (' FROM review AS r'
                   ' JOIN restaurant AS rest'
                   ' ON (r.restaurant_id = rest.id)'
                   ' JOIN zip_code AS zip'
                   ' ON (rest.zip_code = zip.zip_code)'
                   ' JOIN city'
                   ' ON (zip.city_id = city.id)')
    if with_categories:
        query = (select_clause + ', fc.name as category' + join_clause +
                 ' JOIN food_category AS fc'
                 ' ON (r.restaurant_id = fc.restaurant_id);')
    else:
        query = select_clause + join_clause + ';'
    df = pd.read_sql_query(sql=query, con=connection)
    logger.info('Found {0} Reviews in {1}'.format(df.shape[0], language))
    return df
def _fetch_top_city_from(self, top_how_much, table_name): db = SqlHelper(constants.SQL_DATABASE_NAME) db.create_session() df = db.fetch_table_as_dataframe(table_name) self.logger.info('Fetching Top {0}'.format(top_how_much)) cities_dataframe = pd.DataFrame(data=df.iloc[:top_how_much], columns={'city'}) return cities_dataframe['city'].values.tolist()
def run(self): db = SqlHelper(constants.SQL_DATABASE_NAME) db.create_session() if self.top_how_much is not None: df = db.fetch_table_as_dataframe('top_cities') cities_dataframe = pd.DataFrame(data=df.iloc[:self.top_how_much], columns={'city', 'state'}) cities = cities_dataframe.values.tolist() print(cities) elif self.city_name is None: city_objects = db.fetch_entity_where('TopCities') cities = [[city.state, city.city] for city in city_objects] else: city_objects = db.fetch_entity_where('TopCities', True, False, city=self.city_name) cities = [[city.state, city.city] for city in city_objects] all_results = [] try: for city in cities: self.current_city = city[1] self.current_state = city[0] result = { 'city': self.current_city, 'total': None, 'restaurants': [] } self.logger.info('Starting to scrape {0}'.format( self.current_city)) spider = SpeisekarteSpider(self.current_city) spider.run() spider_result = spider.get_result() success = spider_result.get_success() if success and not self.test_mode: data = spider_result.get_data() restaurants = data['restaurants'] total = data['total'] result['total'] = total for restaurant in restaurants: restaurant_id = restaurant['id'] entity_id = self.current_city + '$' + restaurant_id datastore_entity = self._create_datastore_entity( restaurant) success = self._save(entity_id, datastore_entity) restaurant_result = { 'success': success, 'content': restaurant_id } result['restaurants'].append(restaurant_result) all_results.append(result) except HTTPError as error: self.logger.exception( 'Encountered HTTP error %s on %s:\nAbort program.', error.code, error.url) except: self.logger.exception('Something went wrong') finally: db.close_session()
def reset_postgres():
    """Clear the ``requested`` flag on every ZipCode row."""
    helper = SqlHelper(constants.SQL_DATABASE_NAME)
    helper.create_session()
    for zip_entity in helper.fetch_entity_where('ZipCode', True):
        zip_entity.requested = False
    helper.commit_session()
    helper.close_session()
def __init__(self, database, source_entity, test_mode, city_name): super(Transporter, self).__init__() self.logger.info( 'Creating Transporter for Datastore Entity: {0}'.format( source_entity)) self.database = database self.source_entity = source_entity self.source_db = DatastoreHelper() self.target_db = SqlHelper(self.database) self.test_mode = test_mode self.city_name = city_name if self.city_name: self._fetch_zip_codes_from_database()
def run(self): db = SqlHelper(constants.SQL_DATABASE_NAME) yelp_helper = YelpHelper() db.create_session() if self.city_name is None: cities = db.fetch_all(constants.SQL_TABLE_CITY) else: cities = db.fetch_entity_where('City', True, False, name=self.city_name) try: for city in cities: name = city.name for zip_code in city.zip_codes: if not zip_code.requested: zip_completed = True self.location = str(zip_code.zip_code) + ', ' + str(name) + ', Deutschland' self.offset = 0 content, status_code = yelp_helper.get_search(self.location, self.offset) if 'error' not in content and not self.test_mode: total = content['total'] entity_id = str(self.current_path) + str(self.location) + str(self.offset) datastore_entity = self._create_datastore_entity(content) save_success = self._save(entity_id, datastore_entity) if save_success is False: zip_completed = False self.logger.info(u'Found {0} Entries...'.format(total)) while self.offset < total \ and (self.offset + constants.YELP_SEARCH_LIMIT <= 1000) \ and save_success is True: content = yelp_helper.get_search(self.location, self.offset) self.offset += constants.YELP_SEARCH_LIMIT + 1 if 'error' not in content: entity_id = str(self.current_path) + str(self.location) + str(self.offset) datastore_entity = self._create_datastore_entity(content) save_success = self._save(entity_id, datastore_entity) if save_success is False: zip_completed = False else: raise YelpError(content['error']['code'], content['error']['description']) else: raise YelpError(content['error']['code'], content['error']['description']) if zip_completed is True: zip_code.requested = True db.commit_session() except HTTPError as error: self.logger.exception('Encountered HTTP error %s on %s:\nAbort program.', error.code, error.url) except YelpError as err: self.logger.exception(err) finally: db.close_session()
def check_city(city_name):
    """Prompt on stdin until ``city_name`` matches a city in PostgreSQL.

    Returns the (possibly corrected) city name.
    """
    logger.info('Checking if city is available in PostgreSQL...')
    helper = SqlHelper(constants.SQL_DATABASE_NAME)
    helper.create_session()
    match = helper.fetch_city_by_name(city_name)
    while match is None:
        city_name = str(
            input("City {0} not available in database. Try again!".format(
                city_name)))
        match = helper.fetch_city_by_name(city_name)
    helper.close_session()
    return city_name
def _fetch_zip_codes_from_database(self): sql = SqlHelper(constants.SQL_DATABASE_NAME) sql.create_session() city_from_db = sql.fetch_city_by_name(self.city_name) # get zip codes and close session afterwards zip_codes = city_from_db.zip_codes sql.close_session() for zip_code_obj in zip_codes: self.zip_codes.append(zip_code_obj.zip_code)
def fetch_zip_codes_from_database():
    """Return the zip-code rows of the city named by the global ``city_string``.

    Prompts on stdin until the name matches a city in the database.
    """
    global city_string
    helper = SqlHelper(constants.SQL_DATABASE_NAME)
    helper.create_session()
    city_row = helper.fetch_city_by_name(city_string)
    while city_row is None:
        city_string = str(
            input("City {0} not available in database. Try again!".format(
                city_string)))
        city_row = helper.fetch_city_by_name(city_string)
    # read the relationship before the session is closed
    codes = city_row.zip_codes
    helper.close_session()
    return codes
def main():
    """Refresh review counts for restaurants stored with review_count == 0.

    Asks the Yelp business endpoint for the current count; 403/404 responses
    keep the count at 0, a Yelp API error aborts the loop.  All changes are
    committed at the end.
    """
    util.setup_logging()
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    yelp = YelpHelper()
    db.create_session()
    restaurants = db.fetch_entity_where('Restaurant', True, False,
                                        review_count=0)
    logger.info('Found {0} Restaurants with 0 Review Count'.format(
        len(restaurants)))
    # statuses for which the count stays at 0 instead of aborting
    skip_status_codes = (403, 404)
    for restaurant in restaurants:
        logger.info('Old Review Count Value: {0}'.format(
            restaurant.review_count))
        name = restaurant.name
        business_id = restaurant.id
        logger.info(name)
        # BUG FIX: the API response used to be assigned to `result`, the very
        # name of the restaurant list this loop iterates over — a shadowing
        # hazard; it now has its own name.
        business, status_code = yelp.get_business(business_id, 0)
        if status_code not in skip_status_codes:
            if 'error' not in business:
                review_count = business['review_count']
                if review_count is not None:
                    restaurant.review_count = review_count
                    logger.info('New Review Count Value: {0}'.format(
                        restaurant.review_count))
            else:
                logger.error('{0}: {1}'.format(
                    business['error']['code'],
                    business['error']['description']))
                break
        else:
            restaurant.review_count = 0
            logger.info('New Review Count Value: {0}'.format(
                restaurant.review_count))
    db.commit_session()
    db.close_session()
def fill_buying_power_calculated_table():
    """Give every city without a buying power the German national average."""
    util.setup_logging()
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    db.create_session()
    missing = db.fetch_entity_where('City', True, buying_power=None)
    logger.info('Found {0} cities without Buying Power'.format(len(missing)))
    average = get_germany_buying_power_average()
    logger.info('Buying Power Germany: {0}'.format(average))
    for city in missing:
        record = BuyingPowerCalculated()
        record.city_id = city.id
        record.buying_power = average
        db.insert(record)
    db.commit_session()
    db.close_session()
def fill_price_range_calculated_table():
    """Fill price_range_calculated for restaurants lacking a price range.

    For each restaurant without a price range, the mode (most common value)
    of the price ranges in its city is used.  Sentinels: '-1' when the city
    has several modes (statistics.mode raised), '-2' when the city has no
    priced restaurant at all.  Modes are cached per city.
    """
    util.setup_logging()
    # city name -> mode already computed for that city
    city_mode_list = {}
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    db.create_session()
    restaurants_without_price = db.fetch_entity_where('Restaurant', True,
                                                      price_range=None)
    logger.info('Found {0} restaurants without Price'.format(
        len(restaurants_without_price)))
    for restaurant_without_price in restaurants_without_price:
        price_range_calculated = PriceRangeCalculated()
        restaurant_id = restaurant_without_price.id
        price_range_calculated.restaurant_id = restaurant_id
        city_name = restaurant_without_price.city
        if city_name:
            if city_name not in city_mode_list:
                logger.info('Calculating mode for {0}'.format(city_name))
                price_range_list = []
                restaurants_of_city = db.fetch_entity_where('Restaurant',
                                                            True,
                                                            city=city_name)
                logger.info('Found {0} restaurants for {1}'.format(
                    len(restaurants_of_city), city_name))
                for restaurant_of_city in restaurants_of_city:
                    price_range = restaurant_of_city.price_range
                    if price_range:
                        price_range_list.append(price_range)
                if len(price_range_list) > 0:
                    try:
                        price_range_mode = mode(price_range_list)
                        logger.info('Mode for {0}: {1}'.format(
                            city_name, price_range_mode))
                    except StatisticsError:
                        # statistics.mode raises on ties (pre-3.8 semantics)
                        price_range_mode = '-1'
                        logger.info('Multiple modes found for {0}'.format(
                            city_name))
                else:
                    price_range_mode = '-2'
                    logger.info('No price_range attribute found in {0}'.format(
                        city_name))
                city_mode_list[city_name] = price_range_mode
            else:
                # reuse the mode cached for this city
                price_range_mode = city_mode_list[city_name]
                logger.info('Found price_range {0} for {1}'.format(
                    price_range_mode, city_name))
            price_range_calculated.price_range = price_range_mode
        # NOTE(review): the record is inserted even when city_name is empty,
        # i.e. without a price_range value assigned — confirm intended.
        db.insert(price_range_calculated)
    logger.info('Calculated {0} price_range mode(s)'.format(
        len(city_mode_list)))
    db.commit_session()
    db.close_session()
import pandas as pd


def calc(price, priceintervaltype, totalfloorspace, seats):
    """Return the budget left over after acquiring and furnishing a property.

    price: listed price of the property.
    priceintervaltype: Immoscout interval type; 'ONE_TIME_CHARGE' is treated
        as a purchase (BUY_FACTOR), everything else as rent (RENT_FACTOR).
    totalfloorspace: floor space in square metres.
    seats: number of guest seats to furnish.
    """
    if 'ONE_TIME_CHARGE' in str(priceintervaltype):
        multiplier = constants.BUY_FACTOR
    else:
        multiplier = constants.RENT_FACTOR
    # budget - property cost - furnishing (per square metre and per seat)
    rest_budget = constants.BUDGET - price * multiplier - constants.FURNISH_COST_PER_SQUARE_METER * totalfloorspace - seats * constants.FURNISH_COST_PER_SEAT
    return rest_budget


if __name__ == '__main__':
    from main.helper.db_helper import SqlHelper
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    session = db.get_connection()
    immo_df = pd.read_sql_table(table_name=constants.SQL_TABLE_IMMOSCOUT,
                                con=session)
    for index, row in immo_df.iterrows():
        print(str(index + 1) + ". " + row['city'])
    # Zero price means, you have to ask the advertiser
    filter_price_zero = immo_df[immo_df['price'] >= 0]
    # Calcualting min/max rest_budget for seats: 52 (100 %) to 65 (125 %)
    # (max seats give the smallest remaining budget and vice versa)
    result = filter_price_zero.assign(min = lambda x: calc(x['price'], x['priceintervaltype'], x['totalfloorspace'], constants.SEATS_MAX), max = lambda x: calc(x['price'], x['priceintervaltype'], x['totalfloorspace'], constants.SEATS_MIN))
    result = result[result['min'] >= 0]
    # NOTE(review): the source is truncated mid-statement below — the closing
    # part of this print is missing from the visible chunk.
    print('city: ' + str(result['city']) + ' min_rest_budget: ' + str(result['min']) +
from config import constants
from main.helper import util
from main.helper.db_helper import DatastoreHelper, SqlHelper
import pandas as pd
from main.helper.text_analyzer import TextAnalyzer

# Ad-hoc scratch script: opens datastore/SQL helpers and runs one English
# review through TextAnalyzer.text_process to inspect the output.
datastore = DatastoreHelper()
sql = SqlHelper(constants.SQL_DATABASE_NAME)
sql.create_session()
# df = sql.fetch_table_as_dataframe('top_cities')
# cities = pd.DataFrame(data=df.iloc[0:10], columns={'city', 'state'})
# cities = cities.values.tolist()
# city_objects = sql.fetch_entity_where('TopCities')
# cities = [[city.state, city.city] for city in city_objects]
# print(cities[1])
# print(cities[0])
# NOTE(review): the two boolean flags presumably toggle preprocessing steps
# (e.g. stop-word removal / lemmatisation) — confirm against TextAnalyzer.
analyzer = TextAnalyzer('english', True, False, '../../data/tree_tagger')
review = 'Yes, the 5 stars are deserved: here you can drink and buy the best coffee in Bochum (and maybe in the Ruhr area?). I usually only take the roasted coffee'
text = analyzer.text_process(review)
print(text)
# # menu_item_improved = util.convert_list_to_string(analyzer.text_process(text))
# if menu_item_improved:
    def run(self):
        """Collect Yelp reviews (en_US and de_DE) for all pending restaurants
        and store them in the datastore.

        Returns a Result describing overall success.
        """
        result = Result()
        result.set_success(True)
        sql = SqlHelper(constants.SQL_DATABASE_NAME)
        sql.create_session()
        # choose the cities to work on: explicit city wins over the top-N list
        if self.current_city:
            self.cities.append(self.current_city)
        elif self.top_how_much:
            self.cities = self._fetch_top_city_from(self.top_how_much,
                                                    'top_cities')
        restaurants = self._fetch_all_restaurants()
        # reviews are fetched once per locale
        locale_list = ['en_US', 'de_DE']
        if restaurants:
            for restaurant in restaurants:
                self.current_restaurant_id = restaurant.id
                # if there is a change in zip codes;
                # all reviews from current zip code are successfully collected into db
                # set it to collected
                self.logger.debug('Restaurant Zip: {0}'.format(
                    restaurant.zip_code))
                # NOTE(review): `is not` compares identity, not value — for
                # zip codes this probably should be `!=`; confirm intended.
                if self.current_zip_code and restaurant.zip_code is not self.current_zip_code:
                    sql.update_entity('ZipCode', 'zip_code',
                                      str(self.current_zip_code),
                                      'review_collected', True)
                self.current_zip_code = restaurant.zip_code
                self.current_city = restaurant.city
                for locale in locale_list:
                    self.current_locale = locale
                    yelp_entity, status_code = self.yelp.get_reviews(
                        self.current_restaurant_id, locale)
                    if not status_code == 403:
                        if 'error' not in yelp_entity:
                            reviews = yelp_entity['reviews']
                            if len(reviews) > 0:
                                datastore_entity = self._create_datastore_entity(
                                    yelp_entity)
                                # entity id layout: city@zip@restaurant@locale
                                entity_id = self.current_city + '@' + \
                                    str(self.current_zip_code) + '@' + \
                                    str(self.current_restaurant_id) + '@' + \
                                    locale
                                if not self.test_mode:
                                    success = self._save(
                                        entity_id, datastore_entity)
                                    if success:
                                        result.set_success(success)
                                        sql.commit_session()
                            else:
                                self.logger.debug(
                                    'No Reviews found for restaurant {0} in {1}'
                                    .format(restaurant.name,
                                            self.current_city))
                        else:
                            message = yelp_entity['error']['description']
                            result.set_success(False)
                            result.set_message(message)
                            raise YelpError(yelp_entity['error']['code'],
                                            message)
                    else:
                        # 403 from Yelp: treated as "no reviews", move on
                        self.logger.debug(
                            'No Reviews found for restaurant {0} in {1}'.
                            format(restaurant.name, self.current_city))
            else:
                # NOTE(review): this is a for-else — it runs on every normal
                # (break-less) completion of the loop, yet the message reads
                # like the else of the `if success:` save above.  The nesting
                # was reconstructed from collapsed source — confirm intent.
                result.set_success(False)
                result.set_message(
                    'Failure when saving Review Entity to Datastore')
        else:
            result.set_success(True)
            result.set_message(
                'No Restaurants left to collect reviews from! Nice Job')
        sql.close_session()
        self.logger.info(result)
        return result
# Quick manual check: print every city's zip codes from the 'fonethd' database.
from main.helper.db_helper import SqlHelper

database = SqlHelper('fonethd')
database.create_session()
for city_row in database.fetch_all('city'):
    print('Zip Codes for' + str(city_row.name))
    for code_row in city_row.zip_codes:
        print(code_row.zip_code)
class Transporter(ABC, threading.Thread):
    """Transports entities from Google Datastore into the SQL database.

    Subclasses implement :meth:`map`, translating one datastore entity into
    a list of SQL entities.  Work is paged by ``constants.GCP_FETCH_LIMIT``;
    transported datastore entities are flagged so they are not fetched again.
    """

    # class-level defaults; the real values are set per instance in __init__
    database = None
    source_entity = None
    target_entity = None
    source_db = None
    test_mode = None
    source_entity_id = None
    city_name = None
    zip_codes = []
    top_how_much = None
    logger = logging.getLogger(__name__)

    def __init__(self, database, source_entity, test_mode, city_name):
        """database: SQL database name; source_entity: datastore kind to
        read; test_mode: when true nothing is written; city_name: optional
        city filter (restricts work to that city's zip codes).
        """
        super(Transporter, self).__init__()
        self.logger.info(
            'Creating Transporter for Datastore Entity: {0}'.format(
                source_entity))
        self.database = database
        self.source_entity = source_entity
        self.source_db = DatastoreHelper()
        self.target_db = SqlHelper(self.database)
        self.test_mode = test_mode
        self.city_name = city_name
        # BUG FIX: zip_codes used to exist only as a class-level list, so
        # every Transporter instance appended into the same shared list.
        # Give each instance its own list before it is filled below.
        self.zip_codes = []
        if self.city_name:
            self._fetch_zip_codes_from_database()

    def run(self):
        """Thread entry point: page through the datastore and transport all
        matching entities, logging one Result per page."""
        results = []
        self.logger.info('Starting transport...')
        self.target_db.create_session()
        total = self._get_entities(None, None, True)
        self.logger.info('Found a total of %s entries in Google Datastore',
                         str(total))
        offset = 0
        while offset < total:
            result = self._transport(offset)
            results.append(result)
            offset += constants.GCP_FETCH_LIMIT
            # i dont know why but google datastore doesn't allow a offset greater than 2500
            if offset == 2500:
                self.logger.info('Resetting offset...')
                offset = 0
                total = self._get_entities(None, None, True)
                self.logger.info(
                    'Found a total of %s entries in Google Datastore',
                    str(total))
        for result in results:
            self.logger.info(result)
        self.logger.info('Done transporting Restaurants...')

    def _transport(self, offset):
        """Transport one page of source entities starting at ``offset``.

        Returns a Result describing the outcome of this page.
        """
        result = Result()
        limit = constants.GCP_FETCH_LIMIT
        source_entities = self._get_entities(limit, offset, False)
        if source_entities:
            for datastore_entity in source_entities:
                self.logger.info('Starting mapping...')
                entities = self.map(datastore_entity)
                entity_length = len(entities)
                self.logger.info('Mapped {0} entities...'.format(
                    str(entity_length)))
                if not self.test_mode:
                    if entity_length > 0:
                        try:
                            for entity in entities:
                                if entity:
                                    self.logger.info('Saving in database...')
                                    self.target_db.insert(entity)
                            self.logger.info('Commiting DB entries')
                            self.target_db.commit_session()
                            result.set_success(True)
                            result.set_message(
                                'Fetched entries from offset {0} with limit {1}'
                                .format(str(offset), str(limit)))
                        except SQLAlchemyError as err:
                            result.set_success(False)
                            result.set_message(err.code)
                            self.logger.exception('An SQLAlchemyError occured')
                        finally:
                            # NOTE(review): this closes the session after the
                            # FIRST datastore entity of the page; later
                            # entities of the same page then insert on a
                            # closed session — confirm whether SqlHelper
                            # reopens it or whether this is a latent bug.
                            self.target_db.close_session()
                    else:
                        result.set_success(True)
                        result.set_message(
                            'There are no mapped entities that can be saved in database'
                        )
                    self.source_db.set_transported(datastore_entity, True)
                else:
                    result.set_success(True)
                    result.set_message('Test Mode active')
        else:
            result.set_success(False)
            result.set_message(self.source_entity +
                               ' could not be found in Google Datastore')
        return result

    def _fetch_zip_codes_from_database(self):
        """Fill ``self.zip_codes`` with the zip codes of ``self.city_name``."""
        sql = SqlHelper(constants.SQL_DATABASE_NAME)
        sql.create_session()
        city_from_db = sql.fetch_city_by_name(self.city_name)
        # get zip codes and close session afterwards
        zip_codes = city_from_db.zip_codes
        sql.close_session()
        for zip_code_obj in zip_codes:
            self.zip_codes.append(zip_code_obj.zip_code)

    def _fetch_entities_by_zip_code(self, entity_name, limit, offset,
                                    only_keys):
        """Fetch un-transported entities for each zip code in ``self.zip_codes``
        and concatenate the results."""
        result_all = []
        for zip_code in self.zip_codes:
            result = self.source_db.fetch_entity(entity_name,
                                                 limit=limit,
                                                 offset=offset,
                                                 only_keys=only_keys,
                                                 operator='=',
                                                 zip_code=str(zip_code),
                                                 transported=False)
            result_all += result
        return result_all

    def _get_entities(self, limit, offset, only_total):
        """Fetch source entities, optionally filtered by zip codes.

        When ``only_total`` is true, return the number of entities instead
        of the entities themselves.
        """
        if not self.zip_codes:
            content = self.source_db.fetch_entity(self.source_entity, limit,
                                                  offset, only_total, '=',
                                                  transported=False)
        else:
            content = self._fetch_entities_by_zip_code(
                self.source_entity, limit, offset, only_total)
        if only_total:
            result = len(content)
        else:
            result = content
        return result

    # maps target and source structure and returns a list of entities to save in db
    @abstractmethod
    def map(self, datastore_entity) -> List:
        pass
def main():
    """Fill rent_avg_calculated for cities that have no known rent average.

    The value used is the mean of ``rent_avg`` over all cities where it is
    known.
    """
    rents = []
    util.setup_logging()
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    db.create_session()
    # NOTE(review): the third positional flag presumably negates the filter,
    # i.e. this fetches cities WITH a known rent_avg — confirm in SqlHelper.
    result = db.fetch_entity_where('City', True, True, rent_avg=None)
    for city in result:
        rents.append(city.rent_avg)
    if not rents:
        # Robustness fix: numpy.mean([]) is NaN and int(NaN) raises —
        # with no known rents there is nothing to fill in.
        db.close_session()
        return
    rent_avg = int(numpy.mean(rents))
    result = db.fetch_entity_where('City', True, False, rent_avg=None)
    for city in result:
        rent_avg_calculated = RentAvgCalculated()
        rent_avg_calculated.city_id = city.id
        rent_avg_calculated.rent_avg = rent_avg
        db.insert(rent_avg_calculated)
    db.commit_session()
    db.close_session()
def run(self): result = Result() db = SqlHelper(constants.SQL_DATABASE_NAME) df = db.fetch_table_as_dataframe('top_cities') cities = pd.DataFrame(data=df.iloc[0:self.top_how_much], columns={'city'}) for index, row in cities.iterrows(): self.logger.debug(str(index + 1) + ". " + row['city']) # cities = {'city': ['Heidelberg', 'Karlsruhe']} city_for_search = pd.DataFrame(cities, columns=['city']) immo_oauth = OAuth1( constants.IMMOSCOUT_CLIENT_KEY, client_secret=constants.IMMOSCOUT_CLIENT_SECRET, resource_owner_key=constants.IMMOSCOUT_RESOURCE_OWNER_KEY, resource_owner_secret=constants.IMMOSCOUT_RESOURCE_OWNER_SECRET) # create empty geo_df geo_df = pd.DataFrame(columns={'geoId', 'city'}) # get geoid from Immoscout24 API geo_df = self.get_geo_id(city_for_search, geo_df, immo_oauth) # Fläche Retaurant: # https: // se909eeccf1caa559.jimcontent.com / download / version / 1507517357 / module / 11096440527 / name / AuszugDiplomarbeit_13.03.2006.pdf # Gast = 40 % # Technik = 12 % # Personal = 8 % # Gast = 40 % total_floor_space_min = constants.FLOOR_SPACE_GUEST * constants.SEATS_MIN / 40 * 100.0 total_floor_space_max = constants.FLOOR_SPACE_GUEST * constants.SEATS_MAX / 40 * 100.0 restaurant_df = pd.DataFrame() # get Immoscout24 object by geocode for index, row in geo_df.iterrows(): params = { 'realestatetype': 'gastronomy', 'geocodes': str(row['geoId']), 'gastronomytypes': 'restaurant', 'channel': 'is24', 'numberofseats': str(constants.SEATS_MIN) + '-' + str(constants.SEATS_MAX), 'pagesize': '200', 'totalfloorspace': str(total_floor_space_min) + '-' + str(total_floor_space_max) } immo_search_response = requests.request( method='GET', url=constants.IMMOSCOUT_SEARCH_URL, params=params, headers=constants.IMMOSCOUT_HEADERS, auth=immo_oauth) immo_search_json = pd.read_json(immo_search_response.text) hits = immo_search_json['resultlist.resultlist'][0]['numberOfHits'] self.logger.info("Hits: " + str(hits) + " for city: " + str(row['city']) + "\r\n") if hits == 1: immo_object = 
immo_search_json['resultlist.resultlist'][1][0][ 'resultlistEntry']['resultlist.realEstate'] real_estate_id = immo_search_json['resultlist.resultlist'][1][ 0]['resultlistEntry']['resultlist.realEstate']['@id'] restaurant_df = restaurant_df.append(self.transform_df( immo_object, real_estate_id), ignore_index=True, sort=True) elif hits >= 1: for i in range(hits): immo_object = immo_search_json['resultlist.resultlist'][1][ 0]['resultlistEntry'][i]['resultlist.realEstate'] real_estate_id = immo_search_json['resultlist.resultlist'][ 1][0]['resultlistEntry'][i]['resultlist.realEstate'][ '@id'] restaurant_df = restaurant_df.append(self.transform_df( immo_object, real_estate_id), ignore_index=True, sort=True) else: self.logger.info('No object found for city: ' + str(row['city'])) self.logger.info(restaurant_df) result_json = restaurant_df.to_json(orient='records') attributes = self._create_datastore_entity(result_json) success = self._save(self.entity_id, attributes) result.set_success(success) self.logger.info(result) return result
    # Interior of an Immoscout real-estate model class whose `class` header
    # lies outside the visible chunk.  Columns mirror the Immoscout listing.
    id = Column(Integer, primary_key=True, autoincrement=False)
    title = Column(String)
    city = Column(String)
    quarter = Column(String)
    postcode = Column(Integer)
    price = Column(Integer)
    currency = Column(String)
    marketingtype = Column(String)
    priceintervaltype = Column(String)
    totalfloorspace = Column(Numeric)

    def __str__(self):
        # NOTE(review): references self.name and self.updated_at, which are
        # not among the columns declared above — possibly stale copy/paste
        # from another model; confirm against the full class definition.
        return 'id: {0}, name: {1}, updated at: {2} immoscout: {3}' \
            .format(self.id, self.name, self.updated_at, self.city)


class TopCities(Base):
    """Ranking table of top cities (one row per city, keyed by name)."""
    __tablename__ = constants.SQL_TABLE_TOP_CITY

    city = Column(String, primary_key=True, autoincrement=False)
    state = Column(String)
    potential = Column(Numeric)


if __name__ == '__main__':
    # Create all mapped tables when this module is run as a script.
    from main.helper.db_helper import SqlHelper
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    engine = db.get_connection()
    Base.metadata.create_all(engine)
def check_price_range_availability_and_update():
    """Fetch the Yelp price range for restaurants that lack one.

    Restaurants whose Yelp business page has no 'price' attribute are
    counted in ``not_available_count``.  A Yelp API error aborts the loop;
    everything updated so far is still committed in ``finally``.
    """
    yelp_helper = YelpHelper()
    # restaurants that actually received a price range in this run
    restaurants = []
    util.setup_logging()
    not_available_count = 0
    db = SqlHelper(constants.SQL_DATABASE_NAME)
    db.create_session()
    result = db.fetch_entity_where('Restaurant', True, price_range=None)
    logger.info('Found {0} restaurants'.format(str(len(result))))
    try:
        for restaurant in result:
            restaurant_id = restaurant.id
            business, status_code = yelp_helper.get_business(restaurant_id)
            if 'error' not in business:
                if 'price' in business:
                    price_range = business['price']
                    if price_range:
                        restaurant.price_range = price_range
                        # BUG FIX: updated restaurants were never added to
                        # `restaurants`, so the summary log below always
                        # reported 0 updated rows.
                        restaurants.append(restaurant)
                    else:
                        logger.info('Price Range is null')
                    db.insert(restaurant)
                else:
                    not_available_count += 1
            else:
                raise YelpError(business['error']['code'],
                                business['error']['description'])
        logger.info(not_available_count)
    except YelpError as error:
        logger.exception(error)
        # NOTE(review): this summary is only logged on the error path in the
        # original statement order; it cannot sit between except and finally
        # syntactically — confirm whether it was meant to run always.
        logger.info('Adding {0} updated restaurants to DB...'.format(
            len(restaurants)))
    finally:
        db.commit_session()
        db.close_session()