def parse(self, response): print 'params', self.params if self.params is not None: params_encoded = urllib.urlencode(self.params) request_solr = urllib2.Request(self.solr_url + '?' + params_encoded) try: response_solr = urllib2.urlopen(request_solr) except: response_solr = None error_msg = 'params ' + str(self.params) error_msg += "Unexpected error: " + str( sys.exc_info()[0]) + ' ' + str(sys.exc_info()[1]) print error_msg if response_solr is not None: all_data = response_solr.read() data = json.loads(all_data) response_list = data['response']['docs'] url_list = [{ 'id': solr_item['id'], 'url': solr_item['url'] } for solr_item in response_list] for url_item in url_list: anuncio_id = url_item['id'] url = url_item['url'] anuncio = ChileautosItem() anuncio['id'] = anuncio_id anuncio['url'] = url yield scrapy.Request(url, callback=self.parse_anuncio, meta={'anuncio': anuncio})
def parse(self, response): print 'params', self.params if self.params is not None: params_encoded = urllib.urlencode(self.params) request_solr = urllib2.Request(self.solr_url + '?' + params_encoded) try: response_solr = urllib2.urlopen(request_solr) except: response_solr = None error_msg = 'params ' + str(self.params) error_msg += "Unexpected error: " + str( sys.exc_info()[0]) + ' ' + str(sys.exc_info()[1]) print error_msg if response_solr is not None: all_data = response_solr.read() data = json.loads(all_data) response_list = data['response']['docs'] for item in response_list: anuncio_id = item['id'] if 'id' in item else None version = item['version'] if 'version' in item else None marca = item['marca'] if 'marca' in item else None modelo = item['modelo'] if 'modelo' in item else None ano = item['ano'] if 'ano' in item else None puertas = item['puertas'] if 'puertas' in item else None version_sii = self.get_version_sii(anuncio_id, version, marca, modelo, ano, puertas) if version_sii is None: print "sin version SII" anuncio = ChileautosItem() anuncio['id'] = anuncio_id anuncio['version_sii'] = {'set': 'Otro'} yield anuncio else: print "version SII:" + version_sii + "\n" anuncio = ChileautosItem() anuncio['id'] = anuncio_id anuncio['version_sii'] = {'set': version_sii} yield anuncio
def parse_node(self, response, node):
    """Convert one feed node into a ``ChileautosItem`` and yield it.

    Every value is the concatenated text of the matching child element of
    ``node``.  The item id is the node's ``id`` text prefixed with
    ``amotor_``; ``tipo_anuncio`` is additionally stripped and ``vendido``
    is set to ``None``.

    Args:
        response: Response the node belongs to (not used).
        node: Selector for a single feed entry.

    Yields:
        ChileautosItem: the populated item.
    """

    def text_of(tag):
        # Concatenated text of the node's <tag> children ('' when absent).
        return ''.join(node.xpath(tag + '/text()').extract())

    # Item field -> child element it is read from, in assignment order.
    # categoria, carroceria and ciudad_det are not populated here.
    field_tags = (
        ('marca', 'marcatexto'),
        ('modelo', 'modelotexto'),
        ('ano', 'year'),
        ('version_det', 'version'),
        ('precio_det', 'precio'),
        ('comentarios', 'descripcion'),
        ('fecha_publicacion', 'date'),
        ('kilometros_det', 'kms'),
        ('header_nombre', 'title'),
        ('combustible_det', 'combustible'),
        ('region_det', 'region'),
        ('transmision_det', 'transmision'),
    )

    url = text_of('url')
    anuncio_id = text_of('id')
    self.log('url: %s' % url)

    anuncio = ChileautosItem()
    anuncio['id'] = "amotor_" + anuncio_id
    anuncio['url'] = url
    for item_key, tag in field_tags:
        anuncio[item_key] = text_of(tag)
    anuncio['tipo_anuncio'] = text_of('tipo').strip()
    anuncio['vendido'] = None
    yield anuncio
def parse_thumb(self, response): hxs = scrapy.Selector(response) url = response.url m = re.search('(?!\=)\d+(?=\&)', url) anuncio_id = m.group(0) self.log('url: %s' % url) fields = hxs.xpath("//section[@class='box da-wrapper']") anuncio = ChileautosItem() if not fields: url = response.url anuncio['id'] = "yapo_" + anuncio_id anuncio['url'] = url anuncio['vendido'] = {'add': 'NOW'} yield anuncio else: for field in fields: ''' Carga de datos generales ''' pattern = re.compile( r'((?=\[)\[[^]]*\]|(?=\{)\{[^\}]*\}|\"[^"]*\")', re.MULTILINE | re.DOTALL) data = field.xpath( '//script[contains(., "var utag_data =")]/text()').re( pattern)[0] py_obj = demjson.decode(data) data_obj = json.dumps(py_obj) decoded = json.loads(data_obj) if "model" in decoded and "year" in decoded and "brand" in decoded: if "ad_id" in decoded: anuncio['id'] = "yapo_" + decoded['ad_id'] if "brand" in decoded: anuncio['marca'] = decoded['brand'] if "model" in decoded: anuncio['modelo'] = decoded['model'] if "year" in decoded: anuncio['ano'] = decoded["year"] if "version" in decoded: anuncio['version_det'] = decoded["version"] else: anuncio['version_det'] = None if "price" in decoded: anuncio['precio_det'] = decoded["price"] if "description" in decoded: anuncio['comentarios'] = decoded["description"] if "publish_date" in decoded: anuncio['fecha_publicacion'] = decoded["publish_date"] if "km" in decoded: anuncio['kilometros_det'] = decoded["km"] if "ad_title" in decoded: anuncio['header_nombre'] = decoded["ad_title"] if "category_level2" in decoded: anuncio['categoria'] = decoded["category_level2"] if "car_type" in decoded: anuncio['carroceria'] = decoded["car_type"] if "fuel" in decoded: anuncio['combustible_det'] = decoded["fuel"] if "region_level2" in decoded: anuncio['region_det'] = decoded["region_level2"] if "region_level3" in decoded: anuncio['ciudad_det'] = decoded["region_level3"] if "transmission" in decoded: anuncio['transmision_det'] = decoded["transmission"] if anuncio['id'] is None: 
anuncio['id'] = "yapo_" + anuncio_id anuncio['url'] = url anuncio['tipo_anuncio'] = ''.join( field.xpath( '//p[@class="name"]/text()').extract()).strip() anuncio['vendido'] = None yield anuncio else: print 'anuncio sin marca, modelo o año'
def process_item(self, item): patente = 'caam00' id_vehiculo = 101010 id_categoria = 101010 id_sucursal = 101010 id_marca = None id_modelo = None id_transmision = None id_combustible = None id_direccion = None int_cilindrada = None id_techo = 0 id_color = 0 id_segmento = 0 id_tapiz = 0 anuncio_id = item['id'] if 'id' in item else None marca = item['marca'] if 'marca' in item else None modelo = item['modelo'] if 'modelo' in item else None chasis = item['carroceria'] if 'carroceria' in item else None sucursal = item['contact_seller'] if 'contact_seller' in item else None transmision = item[ 'transmision_det'] if 'transmision_det' in item else None combustible = item[ 'combustible_det'] if 'combustible_det' in item else None categoria = item[ 'tipo_categoria_det'] if 'tipo_categoria_det' in item else None color = item[ 'color_exterior_det'] if 'color_exterior_det' in item else None direccion = item['eq_direccion'] if 'eq_direccion' in item else None techo = item['eq_techo'] if 'eq_techo' in item else None cilindrada = item['eq_cilindrada'] if 'eq_cilindrada' in item else None if cilindrada is not "" and cilindrada is not u'' and cilindrada is not '' and cilindrada is not None: int_cilindrada = int(cilindrada) try: if marca: id_marca = self.get_select_one( 'Marca', 'idmarcas', "descripcion = UPPER('%s')" % marca) if id_marca: id_modelo = self.get_select_one( 'Modelo', 'idmodelo', "idmarcas = %s.0 AND descripcion = UPPER('%s')" % (id_marca, modelo)) if transmision: id_transmision = self.get_select_one( 'Transmision', 'idtransmision', "descripcion = '%s'" % transmision) if direccion: id_direccion = self.get_select_one( 'Direccion', 'Id_Direccion', "Descripcion = '%s'" % direccion) if techo: id_techo = self.get_select_one('Techo', 'idtecho', "descripcion = '%s'" % techo) if combustible: id_combustible = self.get_select_one( 'Combustible', 'idcombustible', "descripcion = '%s'" % combustible) if color: id_color = self.get_select_one('Color', 'Id_Color', "Descripcion = '%s'" 
% color) if chasis: id_segmento = self.get_select_one( 'Segmento', 'Id_Segmento', "Descripcion = '%s'" % chasis) except: print "Error:", traceback.format_exc() anuncio = ChileautosItem() anuncio['id'] = anuncio_id anuncio['idvehiculo_amotor'] = {'set': id_vehiculo} anuncio['idvehiculo2_amotor'] = {'set': id_vehiculo} anuncio['patente_amotor'] = {'set': patente} anuncio['IdCategoria_amotor'] = {'set': id_categoria} anuncio['Categoria_amotor'] = {'set': categoria} anuncio['Idsucursal_amotor'] = {'set': id_sucursal} anuncio['sucursal_amotor'] = {'set': sucursal} anuncio['Idmarca_amotor'] = {'set': id_marca} anuncio['Idmodelo_amotor'] = {'set': id_modelo} anuncio['transmision_amotor'] = {'set': id_transmision} anuncio['combustible_amotor'] = {'set': id_combustible} anuncio['direccion_amotor'] = {'set': id_direccion} anuncio['techo_amotor'] = {'set': id_techo} anuncio['cilindrada_amotor'] = {'set': int_cilindrada} anuncio['color_amotor'] = {'set': id_color} anuncio['segmento_amotor'] = {'set': id_segmento} anuncio['tapiz_amotor'] = {'set': id_tapiz} return anuncio
def parse(self, response):
    """Build a ``ChileautosItem`` from a Chileautos ad detail page.

    The item id is the response URL with the
    ``https://www.chileautos.cl/auto/usado/details/CL-AD-`` prefix removed.
    When the main details container is missing the ad is reported as sold
    (``vendido = {'add': 'NOW'}``).  Otherwise the header, the "basic"
    details table and the ``fbq('track', 'INFORMATION', ...)`` script are
    read into the item.

    Args:
        response: Response for the ad detail page.

    Yields:
        ChileautosItem: the populated item, or the "sold" marker item.
    """

    def text_at(node, query):
        # Joined, stripped text of everything ``query`` selects.
        return ''.join(node.xpath(query).extract()).strip()

    def icon_text(node, icon_class):
        # First text node following the <i> element with that class.
        return text_at(
            node,
            '//i[@class="' + icon_class + '"]/following-sibling::text()[1]')

    def basic_row(node, label):
        # Cell text of the row labelled ``label`` in the basic-details tab.
        return text_at(
            node,
            '//div[@id="tab-content--basic"]/table/tr[th/text()="' + label +
            '"]/td/text()')

    url = response.url
    ad_id = url.replace(
        "https://www.chileautos.cl/auto/usado/details/CL-AD-", "")

    hxs = scrapy.Selector(response)
    fields = hxs.xpath(
        "//div[@class='l-content__details-main col-xs-12 col-sm-8']")
    if not fields:
        anuncio = ChileautosItem()
        anuncio['id'] = ad_id
        anuncio['url'] = url
        anuncio['vendido'] = {'add': 'NOW'}
        yield anuncio
        return

    # Item field -> row label in the basic-details table.
    basic_rows = (
        ('vehiculo_det', unicode('Vehículo', 'utf-8')),
        ('precio_det', 'Precio'),
        ('kilometros_det', 'Kilometraje'),
        ('color_exterior_det', 'Color Exterior'),
        ('transmision_det', unicode('Transmisión', 'utf-8')),
        ('puertas_det', 'Puertas'),
        ('pasajeros_det', 'Pasajeros'),
        ('combustible_det', 'Combustible'),
        ('consumo_det', 'Consumo de combustible (combinado)'),
        ('region_det', unicode('Región', 'utf-8')),
        ('ciudad_det', 'Ciudad'),
    )
    # First bracketed list, braced object or quoted string in the script.
    pattern = re.compile(
        r'((?=\[)\[[^]]*\]|(?=\{)\{[^\}]*\}|\"[^"]*\")',
        re.MULTILINE | re.DOTALL)
    year_key = unicode('año', 'utf-8')

    for field in fields:
        anuncio = ChileautosItem()

        # General data.
        anuncio['vendido'] = None
        anuncio['id'] = ad_id
        anuncio['url'] = url
        anuncio['header_nombre'] = text_at(field, 'h1/text()')
        anuncio['fecha_publicacion'] = {'add': text_at(
            field, '//div[@class="published-date"]/span/text()')}
        anuncio['categoria'] = icon_text(
            field, 'csn-icons csn-icons-garage')
        anuncio['carroceria'] = icon_text(field, 'csn-icons csn-icons-body')
        anuncio['region'] = icon_text(field, 'zmdi zmdi-pin')
        anuncio['comentarios'] = text_at(
            field, '//div[@class="car-comments col-xs-12"]/p/text()')

        # Highlighted details; precio_det and kilometros_det are taken
        # from this table.
        for item_key, label in basic_rows:
            anuncio[item_key] = basic_row(field, label)
        anuncio['version_det'] = text_at(
            field,
            '//table/tr[th/text()="' + unicode('Versión', 'utf-8') +
            '"]/td/text()[1]')

        # Brand/model/year come from the tracking script; a page without
        # it still yields the fields gathered so far.
        matches = field.xpath(
            '//script[contains(., "fbq(\'track\', \'INFORMATION\',")]/text()'
        ).re(pattern)
        if matches:
            py_obj = demjson.decode(matches[0])
            # Round-trip through json, as the original code did.
            decoded = json.loads(json.dumps(py_obj))
            if "marca" in decoded:
                anuncio['marca'] = decoded['marca']
            if "modelo" in decoded:
                anuncio['modelo'] = decoded['modelo']
            if year_key in decoded:
                anuncio['ano'] = decoded[year_key]

        yield anuncio
def parse_thumb(self, response): hxs = scrapy.Selector(response) url = response.url clean_url = re.search('^(.*?)(?=\?|$)', response.url).group(0) car_id = re.sub("\?", "", re.search("(\d+)\?", url).group(0)) self.log('url: %s' % clean_url) fields = hxs.xpath("//div[@class='l-content__details-main col-xs-12 col-sm-8']") anuncio = ChileautosItem() if not fields: #anuncio['id'] = "ca_" + car_id #anuncio['url'] = clean_url #anuncio['vendido'] = {'add': 'NOW'} self.log('Anuncio vacio: %s' % clean_url) print fields else: for field in fields: ''' Carga de datos generales ''' anuncio['vendido'] = None # anuncio['id'] = "ca_" + url.replace("https://www.chileautos.cl/auto/usado/details/CL-AD-", "") anuncio['id'] = "ca_" + car_id anuncio['url'] = clean_url anuncio['header_nombre'] = ''.join(field.xpath('h1/text()').extract()).strip() anuncio['fecha_publicacion'] = {'add': ''.join( field.xpath('//div[@class="published-date"]/span/text()').extract()).strip()} anuncio['precio_det'] = ''.join( field.xpath('//h3[@class="key-features__price hidden-xs"]/text()').extract()).strip() anuncio['kilometros_det'] = ''.join(field.xpath( '//i[@class="csn-icons csn-icons-odometer"]/following-sibling::text()[1]').extract()).strip() anuncio['categoria'] = ''.join(field.xpath( '//i[@class="csn-icons csn-icons-garage"]/following-sibling::text()[1]').extract()).strip() anuncio['carroceria'] = ''.join(field.xpath( '//i[@class="csn-icons csn-icons-body"]/following-sibling::text()[1]').extract()).strip() anuncio['region'] = ''.join(field.xpath( '//i[@class="zmdi zmdi-pin"]/following-sibling::text()[1]').extract()).strip() anuncio['comentarios'] = ''.join(field.xpath( '//div[@class="car-comments col-xs-12"]/p/text()').extract()).strip() anuncio['img_url'] = ','.join(field.xpath( '//div[@class="item__image"]/div/@data-lazy-load-src').extract()).strip() # print anuncio['img_url'], anuncio['id'] ''' Carga de detalles destacados ''' anuncio['vehiculo_det'] = ''.join(field.xpath( 
'//div[@id="tab-content--basic"]/table/tr[th/text()="' + unicode('Vehículo', 'utf-8') + '"]/td/text()').extract()).strip() anuncio['precio_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="Precio"]/td/text()').extract()).strip() anuncio['kilometros_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="Kilometraje"]/td/text()').extract()).strip() anuncio['color_exterior_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="Color Exterior"]/td/text()').extract()).strip() anuncio['transmision_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="' + unicode('Transmisión', 'utf-8') + '"]/td/text()').extract()).strip() anuncio['puertas_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="Puertas"]/td/text()').extract()).strip() anuncio['pasajeros_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="Pasajeros"]/td/text()').extract()).strip() anuncio['combustible_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="Combustible"]/td/text()').extract()).strip() anuncio['consumo_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="Consumo de combustible (combinado)"]/td/text()').extract()).strip() anuncio['region_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="' + unicode('Región', 'utf-8') + '"]/td/text()').extract()).strip() anuncio['ciudad_det'] = ''.join(field.xpath( '//div[@id="tab-content--basic"]/table/tr[th/text()="Ciudad"]/td/text()').extract()).strip() anuncio['version'] = ''.join(field.xpath( '//table/tr[th/text()="' + unicode('Versión', 'utf-8') + '"]/td[1]/text()').extract()).strip() ''' Carga de Especificaciones Detalles ''' anuncio['tipo_vehiculo_det'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Tipo Vehiculo"]/td[1]/text()').extract()).strip() 
anuncio['tipo_categoria_det'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Tipo Categoria"]/td[1]/text()').extract()).strip() anuncio['version_det'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="' + unicode('Versión', 'utf-8') + '"]/td[1]/text()').extract()).strip() ''' Carga de Especificaciones Equipamiento ''' anuncio['eq_air_acon'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Aire Acondicionado"]/td[1]/text()').extract()).strip() anuncio['eq_alzavid'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Alzavidrios Electricos"]/td[1]/text()').extract()).strip() anuncio['eq_airbag'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Airbag"]/td[1]/text()').extract()).strip() anuncio['eq_cierre_cent'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Cierre Centralizado"]/td[1]/text()').extract()).strip() anuncio['eq_llantas'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Llantas"]/td[1]/text()').extract()).strip() anuncio['eq_direccion'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="' + unicode('Dirección', 'utf-8') + '"]/td[1]/text()').extract()).strip() anuncio['eq_techo'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Techo"]/td[1]/text()').extract()).strip() anuncio['eq_puertas'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Puertas"]/td[1]/text()').extract()).strip() anuncio['eq_cilindrada'] = ''.join(field.xpath( '//table[@class="table table-condensed table-striped"]/tr[th/text()="Cilindrada"]/td[1]/text()').extract()).strip() pattern = re.compile(r'((?=\[)\[[^]]*\]|(?=\{)\{[^\}]*\}|\"[^"]*\")', re.MULTILINE | re.DOTALL) 
data = field.xpath('//script[contains(., "fbq(\'track\', \'INFORMATION\',")]/text()').re(pattern)[0] py_obj = demjson.decode(data) data_obj = json.dumps(py_obj) decoded = json.loads(data_obj) if "marca" in decoded: anuncio['marca'] = decoded['marca'] if "modelo" in decoded: anuncio['modelo'] = decoded['modelo'] if unicode('año', 'utf-8') in decoded: anuncio['ano'] = decoded[unicode('año', 'utf-8')] ''' Carga de contacto ''' seller_link = ''.join(field.xpath('//tr[td/text()="Vendedor"]/td[2]/a/@href').extract()) if seller_link is not None and seller_link is not "": anuncio['contact_seller_url'] = self.domain_url + seller_link anuncio['contact_seller'] = ''.join(field.xpath( '//tr[td/text()="Vendedor"]/td[2]/a/text()').extract()).strip() anuncio['contact_name'] = ''.join(field.xpath( '//tr[td/text()="Contacto"]/td[2]/text()').extract()).strip() anuncio['contact_number'] = ', '.join(field.xpath( '//td[@id="phone"]/p/text()').extract()).strip() anuncio['contact_address'] = ''.join(field.xpath( '//tr[td/text()="' + unicode('Dirección', 'utf-8') + '"]/td[2]/text()').extract()).strip() anuncio['contact_comuna'] = ''.join(field.xpath( '//tr[td/text()="Comuna"]/td[2]/text()').extract()).strip() anuncio['contact_city'] = ''.join(field.xpath( '//tr[td/text()="Ciudad"]/td[2]/text()').extract()).strip() anuncio['contact_region'] = ''.join(field.xpath( '//tr[td/text()="' + unicode('Región', 'utf-8') + '"]/td[2]/text()').extract()).strip() yield anuncio