Exemplo n.º 1
0
    def close_spider(self, spider):
        '''
        Dump the whole scraped database into a single JSON file when the spider closes.

        Nothing happens unless the spider is a TripAdvisorHotelSpider and both the
        'OUTPUT_SQLITE' and 'OUTPUT_BULK_JSON' paths are configured. The export is
        best-effort: a failure is logged and never propagated to the caller.

        :param spider: The spider that has just been closed.
        :return: None
        '''
        if not isinstance(spider, TripAdvisorHotelSpider):
            return

        config = GlobalConfig()
        db_path = config.get_path('OUTPUT_SQLITE')
        file_path = config.get_path('OUTPUT_BULK_JSON')
        if db_path is None or file_path is None:
            return

        try:
            with TripAdvisorDB(db_path) as db:
                data = db.get_everything()
                with open(file_path, 'w') as fh:
                    fh.write(json.dumps(data))
        except Exception:
            # Still best-effort, but the failure is now visible in the log and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            # Function-scope import keeps this method independent of the module imports.
            import logging
            logging.getLogger(__name__).exception(
                'Could not export the scraped data to "%s"', file_path)
 def start_requests(self):
     '''
     Called when the spider starts issuing its requests.

     The search mode comes from the global configuration: a search by terms when
     'SEARCH_BY_TERMS' is set, otherwise a search by place using 'SEARCH_BY_LOCATION'.

     :return: A generator yielding the single initial search request built by
     TripAdvisorRequests (a scrapy.Request according to the original notes).
     '''
     settings = GlobalConfig()

     if not settings.is_set('SEARCH_BY_TERMS'):
         # No search terms configured: fall back to the location based search
         yield TripAdvisorRequests.search_hotels_by_place(
             place=settings.get_value('SEARCH_BY_LOCATION'),
             callback=self.parse_hotel_search_by_place)
         return

     yield TripAdvisorRequests.search_hotels_by_terms(
         terms=settings.get_value('SEARCH_BY_TERMS'),
         callback=self.parse_hotel_search_by_terms)
    def parse_hotel(self, response):
        '''
        Parse the response of a TripAdvisor hotel page.

        The hotel information is always parsed; deals and reviews are parsed only when
        'SCRAP_DEALS' / 'SCRAP_REVIEWS' are true in the global configuration.

        :param response: The response for the hotel page.
        :return: An iterator chaining the results of every selected parser, in the order
        info, deals, reviews.
        '''
        settings = GlobalConfig()

        parsers = [self.parse_hotel_info(response)]
        if settings.is_true('SCRAP_DEALS'):
            parsers += [self.parse_hotel_deals(response)]
        if settings.is_true('SCRAP_REVIEWS'):
            parsers += [self.parse_hotel_reviews(response)]

        return chain.from_iterable(parsers)
Exemplo n.º 4
0
    def __init__(self, file_path):
        '''
        Set up the logger owned by this instance.

        The logger always runs at DEBUG level. Its records propagate to the ancestor
        loggers only when both 'ENABLE_DEBUG' and 'OUTPUT_DEBUG_INFO_TO_STDOUT' are true.
        When 'ENABLE_DEBUG' is true and a file path is given, a file handler is attached
        that starts from an empty file.

        :param file_path: Path of the log file, or None to skip file logging.
        '''
        self.log = logging.getLogger(str(self))
        self.log.setLevel(logging.DEBUG)

        config = GlobalConfig()
        debug_enabled = config.is_true('ENABLE_DEBUG')
        self.log.propagate = debug_enabled and config.is_true('OUTPUT_DEBUG_INFO_TO_STDOUT')

        if not debug_enabled or file_path is None:
            return

        try:
            # mode='w' truncates the file when the handler opens it, so the log of a
            # previous run is discarded without a separate open/close step.
            log_file_handler = logging.FileHandler(file_path, mode='w')
        except Exception:
            # File logging is optional: report the problem and carry on without it
            # instead of silently ignoring it.
            self.log.warning('Could not open log file "%s"', file_path, exc_info=True)
        else:
            self.log.addHandler(log_file_handler)
Exemplo n.º 5
0
def get_data():
    '''
    A GET request to this route ("/data") returns information about the hotels scraped
    from TripAdvisor.

    The payload holds one entry per geolocated hotel (id, name, address, latitude,
    longitude), the row counts of the hotel, deal, review and geolocation tables, and
    whether the scraper has finished. If the database cannot be read, an empty hotel
    list and zeroed counters are reported instead.

    :return: A JSON encoded string.
    '''
    columns = ('id', 'name', 'address', 'latitude', 'longitude')
    count_queries = (
        ('num_hotels', 'SELECT COUNT(*) FROM hotel_info'),
        ('num_deals', 'SELECT COUNT(*) FROM hotel_deal'),
        ('num_reviews', 'SELECT COUNT(*) FROM hotel_review'),
        ('num_geolocalized_hotels', 'SELECT COUNT(*) FROM hotel_geo'),
    )

    try:
        db = sqlite.connect(GlobalConfig().get_path('OUTPUT_SQLITE'))
        try:
            cursor = db.cursor()
            cursor.execute(
                """
                SELECT id, name, address, latitude, longitude
                FROM hotel_geo AS geo INNER JOIN hotel_info as info ON geo.hotel_id = info.id 
                """)
            # One dict per row, keyed by the selected column names
            hotel_data = [dict(zip(columns, row)) for row in cursor.fetchall()]

            counts = {}
            for key, query in count_queries:
                cursor.execute(query)
                (counts[key],) = cursor.fetchone()
        finally:
            # Close explicitly: assuming `sqlite` is the stdlib sqlite3 module, using
            # the connection in a `with` statement would not close it.
            db.close()
    except Exception:
        # Any failure (missing file, missing tables, ...) yields an empty report
        hotel_data = []
        counts = {key: 0 for key, _ in count_queries}

    return json.dumps({ 'hotel-data' : hotel_data,
                        'meta' : {
                            'num_hotels' : counts['num_hotels'],
                            'num_geolocalized_hotels' : counts['num_geolocalized_hotels'],
                            'num_deals' : counts['num_deals'],
                            'num_reviews' : counts['num_reviews']
                        },
                        'scraper_finished' : not TripAdvisorScraper().is_running() })
Exemplo n.º 6
0
    def open_spider(self, spider):
        '''
        Open and reset the sqlite database when a TripAdvisor hotel spider starts.

        Afterwards self.db holds a freshly reset TripAdvisorDB, or None when no
        'OUTPUT_SQLITE' path is configured or the database could not be opened or
        reset. Spiders of any other type are ignored and self.db is left untouched.

        :param spider: The spider that has just been opened.
        :return: None
        '''
        if not isinstance(spider, TripAdvisorHotelSpider):
            return

        db_path = GlobalConfig().get_path('OUTPUT_SQLITE')
        self.db = None
        if db_path is None:
            # No database configured: nothing to open
            return

        try:
            db = TripAdvisorDB(db_path)
            db.reset()
        except Exception:
            # Storing items is optional; without a usable database self.db stays None.
            return
        self.db = db
Exemplo n.º 7
0
    def search_place(cls, address, callback):
        '''
        Build a request to the Google Maps API that looks up a location or place.

        :param address: A place or address to search for.
        :param callback: Callback attached to the returned request.
        :return: A Request whose URL is the root URL of this class followed by the
        url-encoded 'address' and 'key' parameters.
        '''
        query_params = {
            'address': address,
            'key': GlobalConfig().get_value('GOOGLE_MAPS_API_KEY'),
        }
        root_url = cls.get_root_url()
        query_string = urlencode(query_params)

        return Request(url='{}?{}'.format(root_url, query_string), callback=callback)
Exemplo n.º 8
0
 def __init__(self):
     '''
     Map each item class name to the output path configured for it.
     '''
     settings = GlobalConfig()
     # (item class name, configuration key holding its output path)
     outputs = (
         ('TripAdvisorHotelReview', 'OUTPUT_REVIEWS_JSON'),
         ('TripAdvisorHotelInfo', 'OUTPUT_HOTEL_INFO_JSON'),
         ('TripAdvisorHotelDeals', 'OUTPUT_DEALS_JSON'),
         ('TripAdvisorHotelGeolocation', 'OUTPUT_GEO_JSON'),
     )
     self.files = {item_name: settings.get_path(key) for item_name, key in outputs}
Exemplo n.º 9
0
def get_json():
    '''
    The route "/get-json-data" serves the scraped data in JSON format.

    Everything stored in the database is returned as an attachment named
    "tripadvisor.json". If the database cannot be read, an empty JSON list is served.

    :return: The response carrying the JSON document.
    '''
    try:
        with TripAdvisorDB(db_path=GlobalConfig().get_path('OUTPUT_SQLITE')) as db:
            data = db.get_everything()
    except Exception:
        # Deliberate fallback to an empty document; unlike a bare `except`, this
        # does not swallow KeyboardInterrupt/SystemExit.
        data = []

    response = make_response(json.dumps(data))
    response.headers['Content-Disposition'] = 'attachment; filename=tripadvisor.json'
    return response
Exemplo n.º 10
0
def get_sqlite():
    '''
    The route "/get-sqlite-data" serves the sqlite database file holding the data
    scraped so far.

    The file is returned as an attachment named "tripadvisor.db". If it cannot be
    read, a plain error message is returned instead.

    :return: The response carrying the database file, or an error string.
    '''
    try:
        with open(GlobalConfig().get_path('OUTPUT_SQLITE'), 'rb') as fh:
            data = fh.read()
    except Exception:
        # Covers a missing/unreadable file as well as an unset path; unlike a bare
        # `except`, this does not swallow KeyboardInterrupt/SystemExit.
        return 'Error fetching sqlite file database'

    response = make_response(data)
    response.headers['Content-Disposition'] = 'attachment; filename=tripadvisor.db'
    return response
Exemplo n.º 11
0
    def __init__(self, db_path):
        '''
        Initialise the instance. The connection to the database is opened here, in
        the constructor.

        :param db_path: Path handed to sqlite3.connect.
        '''
        log_path = GlobalConfig().get_path('OUTPUT_SQLITE_LOG')
        Logger.__init__(self, log_path)

        self.log.debug('Connecting to TripAdvisor sqlite database...')
        self.db = sqlite3.connect(db_path)

        def inserter(table):
            # insert_item is looked up when the handler runs, as with an inline lambda
            return lambda item: self.insert_item(item, table)

        # Item class name -> handler that stores the item in its table
        self.item_handlers = {
            'TripAdvisorHotelReview': inserter('hotel_review'),
            'TripAdvisorHotelInfo': inserter('hotel_info'),
            'TripAdvisorHotelDeals': inserter('hotel_deal'),
            'TripAdvisorHotelGeolocation': inserter('hotel_geo'),
        }
    def parse_hotel_info(self, response):
        '''
        Parse the information of a hotel from its TripAdvisor page.

        Extracts name, phone number, amenities and address (street, locality and
        country) through CSS selectors, and uses the SHA-256 hex digest of the page URL
        as the hotel id.

        :param response: The response for the hotel page.
        :return: A generator yielding the hotel info item and, only when 'SCRAP_GEO' is
        true, a geolocation request carrying the hotel id in its meta.
        '''
        # Captures the text with its leading spaces stripped
        trimmed_text = '^[ ]*(.+)[ ]*$'

        loader = ItemLoader(item=TripAdvisorHotelInfo(), response=response)

        loader.add_css('name', '#HEADING::text')
        loader.add_css('phone_number', 'div.phone span:not(.ui_icon)::text')
        loader.add_css(
            'amenities',
            'div.amenitiesColumn div.detailsMid div.highlightedAmenity::text',
            re=trimmed_text)

        loader.add_css('address',
                       'div.address span.street-address::text',
                       re=trimmed_text)
        # The locality text ends with a comma that is left out of the capture
        loader.add_css('address',
                       'div.address span.locality::text',
                       re='^[ ]*(.+),[ ]*$')
        loader.add_css('address',
                       'div.address span.country-name::text',
                       re=trimmed_text)

        loader.add_value('id', sha256(response.url.encode()).hexdigest())

        item = loader.load_item()

        # .get() instead of ['name']: a page without a heading leaves the field unset,
        # and a KeyError here would abort the generator before the item is yielded.
        self.log.debug('Succesfully info extracted from "{}" hotel'.format(
            item.get('name')))

        yield item

        # Build the geolocation request only when it is actually going to be sent
        if GlobalConfig().is_true('SCRAP_GEO'):
            geo_request = GMapRequests.search_place(
                address=item.get('address'), callback=self.parse_hotel_geolocation)
            geo_request.meta['hotel_id'] = item.get('id')
            yield geo_request
    def __init__(self, **kwargs):
        '''
        Initialise this spider instance.

        The keyword arguments are wrapped in a Config and layered over the global
        configuration, which is then checked.

        The original notes describe two optional settings:
        :param terms: Search terms used to find hotels on TripAdvisor.
        :param locations: A location used to find hotels on TripAdvisor, e.g.
        "Olite, Navarra" or "Spain".

        According to those notes, hotels are found through a search by terms when terms
        is not None, and through a search by location otherwise.
        NOTE(review): confirm the exact keyword names against the configuration keys.
        '''
        Spider.__init__(self)

        settings = GlobalConfig()
        # The log path is read before the keyword overrides are applied
        scrap_log_path = settings.get_path('OUTPUT_SCRAP_LOG')
        self.log = Logger(scrap_log_path)

        overrides = Config(kwargs)
        settings.override(overrides)
        settings.check()
Exemplo n.º 14
0
 def init(self):
     '''
     Override the global configuration with the 'scraper.conf.py' file that sits
     next to this module.
     '''
     conf_file = join(dirname(__file__), 'scraper.conf.py')
     global_config = GlobalConfig()
     global_config.override(Config.load_from_file(conf_file))
 def from_crawler(cls, crawler, *args, **kwargs):
     '''
     Override the global configuration with the crawler settings, then build the
     instance from the remaining arguments.

     :param crawler: Object whose settings attribute is wrapped in a Config.
     :return: A new instance created with cls(*args, **kwargs).
     '''
     global_config = GlobalConfig()
     global_config.override(Config(crawler.settings))
     instance = cls(*args, **kwargs)
     return instance