def parse_data(self, response):
    """Yield an ``AutodataItem`` for each booking-info block on an Xcars page.

    Make and model are read from the two ``<span>`` elements inside the
    block's ``<h2>`` (the second span feeds both ``model`` and ``Spec``).
    Price, year, mileage and colours are taken from the label/value ``<li>``
    rows. All remaining fields are emitted blank.

    The ``meta`` entry carries the source, a UTC timestamp, a random uid and
    an MD5 checksum of the item as it stands *before* ``meta``, ``Car_URL``
    and ``Source`` are filled in.
    """
    field_names = (
        "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats", "other_specs_engine_size",
        "other_specs_horse_power", "colour_exterior", "colour_interior",
        "fuel_type", "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
        "condition", "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "mileage_unit", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
    )

    for block in response.xpath('//div[@class="booking-info zxcv abcd"]'):
        item = AutodataItem()
        meta = MetaItem()

        # Start from an all-blank record so every field is always present.
        for name in field_names:
            item[name] = ""

        make = "".join(block.xpath('h2/span[1]/text()').extract()).strip()
        second_span = "".join(
            block.xpath('h2/span[2]/text()').extract()).strip()

        item["Country"] = "KSA"
        item["Seller_Type"] = "Large Independent Dealers"
        item["Seller_Name"] = "Xcars"
        item["Make"] = make
        item["model"] = second_span
        item["Spec"] = second_span
        item["Car_Name"] = make + ' ' + second_span
        item["vat"] = 'yes'
        item["Last_Code_Update_Date"] = 'Thursday, June 04, 2019'
        item["Scrapping_Date"] = datetime.today().strftime('%A, %B %d, %Y')

        # NOTE(review): the leading "//" makes this query document-wide, so
        # every block sees the same rows - confirm whether ".//" was meant.
        for row in block.xpath('//div[@class="col-xs-7"]/ul/li'):
            label = ''.join(row.xpath('p[1]/text()').extract()).strip()
            text = ''.join(row.xpath('p[2]/text()').extract()).strip()
            words = text.split(' ')
            if 'Price' in label:
                # First word is stored as the amount, last as the currency.
                item['Price_Currency'] = words[-1]
                item['asking_price_inc_VAT'] = words[0]
            elif 'Year' in label:
                item['Year'] = text
            elif 'Mileage' in label:
                item['mileage'] = words[0]
                item['mileage_unit'] = words[-1]
            elif 'Exterior' in label:
                item["colour_exterior"] = text
            elif 'Interior' in label:
                item["colour_interior"] = text

        meta['src'] = "xcars.co"
        meta['ts'] = datetime.utcnow().isoformat()
        meta['name'] = "xcars"
        meta['url'] = response.url
        meta['uid'] = str(uuid.uuid4())
        # Checksum is taken while Car_URL is still blank.
        snapshot = json.dumps(dict(item), sort_keys=True)
        meta['cs'] = hashlib.md5(snapshot.encode('utf-8')).hexdigest()

        item['meta'] = dict(meta)
        item['Car_URL'] = response.url
        item['Source'] = meta['src']
        yield item
def parse_data(self, response):
    """Build the ``AutodataItem`` for a single al-zayani.com vehicle page.

    Title parts (make / model / variant) come from the title module, the
    technical values from the overview-data cells and the price from the
    ``price-now`` block. Mileage and engine size are split into a value
    (all words but the last) and a unit (the last word). The price text is
    split on spaces: the first word is stored as the currency and the
    second as the amount; when there is no second word the amount is left
    blank instead of raising.

    The ``meta`` checksum is computed while ``Last_Code_Update_Date`` and
    ``Scrapping_Date`` are still blank; both are filled in afterwards.

    Yields:
        The populated item.
    """
    sel = Selector(response)

    def text_of(query):
        """Return the matched text nodes joined together and stripped."""
        return ''.join(sel.xpath(query).extract()).strip()

    def overview(cell):
        """Return the value shown in the named overview-data cell."""
        return text_of(
            '//div[@class="overview-data module overview-data-standard"]'
            '/div/div/div[@class="cell %s"]'
            '/span[@class = "value %s"]/text()' % (cell, cell))

    def title_part(css_class):
        """Return the text of the named span inside the page title."""
        return text_of(
            '//div[@class="title module align-center"]'
            '/h3/span[@class="%s"]/text()' % css_class)

    # Each multi-part value is queried once and split once.
    engine_words = overview('engine-size').split(' ')
    mileage_words = overview('mileage').split(' ')
    price_words = text_of(
        '//div[@class="price-now"]/span[@class="value"]/text()').split(' ')

    item2 = MetaItem()
    item1 = AutodataItem()

    item1["Last_Code_Update_Date"] = ""
    item1["Scrapping_Date"] = ""
    item1["Country"] = 'Syria'
    item1["City"] = text_of(
        '//div[@class="location-address module"]/section/div'
        '/span[@class="address-city"]/text()')
    item1["Seller_Type"] = "Large Independent Dealers"
    item1["Seller_Name"] = "Al-Zayani"
    item1["Car_URL"] = response.url
    item1["Year"] = overview('reg-year')
    item1["Make"] = title_part('make')
    item1["model"] = title_part('model')
    item1["Spec"] = title_part('variant')
    item1["Car_Name"] = (
        item1["Make"] + ' ' + item1["model"] + ' ' + item1["Spec"])
    item1["Doors"] = ""
    item1["transmission"] = overview('transmission')
    item1["trim"] = ""
    item1["bodystyle"] = overview('bodystyle')
    item1["other_specs_gearbox"] = ""
    item1["other_specs_seats"] = ""
    item1["other_specs_engine_size"] = ' '.join(engine_words[:-1])
    item1["other_specs_horse_power"] = ""
    item1["colour_exterior"] = overview('exterior-colour')
    item1["colour_interior"] = ""
    item1["fuel_type"] = overview('fuel-type')
    item1["import_yes_no_also_referred_to_as_GCC_spec"] = ""
    item1["mileage"] = ' '.join(mileage_words[:-1])
    item1["condition"] = ""
    item1["warranty_untill_when"] = ""
    item1['service_contract_untill_when'] = ''
    item1['Price_Currency'] = price_words[0]
    # A missing or single-word price must not discard the whole listing.
    item1['asking_price_inc_VAT'] = (
        price_words[1] if len(price_words) > 1 else '')
    item1['asking_price_ex_VAT'] = ''
    item1['warranty'] = ''
    item1['service_contract'] = ''
    item1['vat'] = 'yes'
    for name in ('autodata_Make', 'autodata_Make_id', 'autodata_model',
                 'autodata_model_id', 'autodata_Spec', 'autodata_Spec_id',
                 'autodata_transmission', 'autodata_transmission_id',
                 'autodata_bodystyle', 'autodata_bodystyle_id'):
        item1[name] = ''
    item1['mileage_unit'] = mileage_words[-1]
    item1['engine_unit'] = engine_words[-1]

    item2['src'] = "al-zayani.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "zayani"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item1), sort_keys=True).encode('utf-8')).hexdigest()

    item1['meta'] = dict(item2)
    # Dates are set after the checksum so they do not influence it.
    item1['Last_Code_Update_Date'] = 'June 13, 2019'
    item1['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')
    item1['Source'] = item2['src']
    yield item1
def parse_data(self, response):
    """Build the ``AutodataItem`` for one dubicars.com listing page.

    Seller details are constants. The price is the second text node matched
    under a ``<strong>`` whose class contains ``money`` (text after ``AED``);
    every other value is read by fixed position from the text of all
    ``<td>`` cells on the page. A page with too few cells raises
    ``IndexError`` and yields nothing.

    The ``meta`` checksum is computed before any scraped value is stored.
    """
    item = AutodataItem()
    meta = MetaItem()

    # Blank every field first so the record always has the full schema.
    for name in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item[name] = ""

    item['vat'] = 'yes'
    item["Last_Code_Update_Date"] = "Wednesday,June 19,2019"
    item["Scrapping_Date"] = datetime.today().strftime('%A, %B %d, %Y')
    item["Country"] = "UAE"
    item["City"] = "Dubai"
    item["Seller_Type"] = "MarketPlace"
    item["Seller_Name"] = "111 Used Cars"
    item["Car_URL"] = response.url

    meta['src'] = "dubicars.com"
    meta['ts'] = datetime.utcnow().isoformat()
    meta['name'] = "dubi_spider"
    meta['url'] = response.url
    meta['uid'] = str(uuid.uuid4())
    snapshot = json.dumps(dict(item), sort_keys=True)
    meta['cs'] = hashlib.md5(snapshot.encode('utf-8')).hexdigest()
    item['meta'] = dict(meta)
    item['Source'] = meta['src']

    money = response.xpath(
        "//strong[contains(@class,'money')]/text()").extract()
    item['asking_price_inc_VAT'] = money[1].split('AED')[-1].strip()
    item['Price_Currency'] = 'AED'

    cells = response.xpath("//tr/td/text()").extract()
    item['Year'] = str(cells[2])
    item['Make'] = str(cells[0])
    item['model'] = cells[1]
    item["Car_Name"] = item['Make'] + ' ' + item['model']

    # NOTE(review): Car_Name always contains the separator space, so this
    # check never fails - confirm whether a stripped comparison was meant.
    if item['Car_Name'] != '':
        item['Spec'] = cells[8].strip()
        # Field -> position in the flat list of table-cell texts.
        for name, position in (('transmission', 9),
                               ('bodystyle', 5),
                               ('colour_exterior', 4),
                               ('fuel_type', 10),
                               ('mileage', 6),
                               ('colour_interior', 13),
                               ('other_specs_seats', 11)):
            item[name] = cells[position]
        item['mileage_unit'] = 'km'
        yield item
def parse_data(self, response):
    """Build the ``AutodataItem`` for one Bentley Muscat vehicle page.

    The car name is the page heading with its first two words dropped; the
    price text is cut into a 3-character currency prefix and the remainder
    after one separator character. Technical values come from the
    label/value tiles, matched on the lower-cased label.

    Malformed tiles no longer abort the listing: an engine value without a
    unit leaves ``engine_unit`` blank, and a non-numeric power figure leaves
    ``other_specs_horse_power`` blank.

    Yields:
        The populated item, with a ``meta`` dict whose checksum covers every
        field set before ``meta`` and ``Source``.
    """
    item2 = MetaItem()
    item1 = AutodataItem()

    # Blank every field first so the record always has the full schema.
    for name in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "engine_unit", "mileage_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id", "wheel_size",
            "top_speed_kph", "cylinders", "acceleration", "torque_Nm"):
        item1[name] = ""
    item1["Seller_Name"] = "Bentley Motors Muscat"
    item1['vat'] = 'yes'

    sel = Selector(response)

    def text_of(node, query):
        """Return all text matched by *query* under *node*, joined/stripped."""
        return ''.join(node.xpath(query).extract()).strip()

    item1["Car_URL"] = response.url
    heading = text_of(
        sel, '//div[@class="vehicle-title column block"]/h1//text()')
    item1["Car_Name"] = ' '.join(heading.split(' ')[2:])

    price_text = text_of(
        sel, '//div[@class="vehicle-prive column block"]/div//text()')
    item1["Price_Currency"] = price_text[:3]
    item1["asking_price_inc_VAT"] = price_text[4:]

    tiles = sel.xpath(
        '//ul[@class="unstyle tiles-container-10 s-space-5 vertical-collapse"]/li'
    )
    for tile in tiles:
        key = text_of(
            tile, 'div/div[@class="column s50 m50 l50 bold"]//text()').lower()
        value = text_of(
            tile, 'div/div[@class="column s50 m50 l50 vertical-top"]//text()')

        if key == "body style":
            item1["bodystyle"] = value
        elif key == "paint colour":
            item1["colour_exterior"] = value
        elif key == "registration date":
            # Keep only what follows the last dot of the date text.
            item1["Year"] = value.split('.')[-1]
        elif key == "mileage":
            # The last two characters are stored as the unit.
            item1["mileage"] = value[:-2]
            item1['mileage_unit'] = value[-2:]
        elif key == "transmission":
            item1["transmission"] = value
        elif key == "engine":
            engine_words = value.split(' ')
            item1["other_specs_engine_size"] = engine_words[0]
            if len(engine_words) > 1:
                item1['engine_unit'] = engine_words[1]
        elif key == "torque":
            item1['torque_Nm'] = value.split(' ')[0]
        elif "acceleration" in key:
            item1['acceleration'] = value.split(' ')[0]
        elif "maximum speed" in key:
            item1['top_speed_kph'] = value.split(' ')[0]
        elif key == "power":
            figure = value.replace(u'\xa0', u' ').replace(
                ',', '').split(' ')[0]
            try:
                power = int(figure)
            except ValueError:
                # Non-numeric power text: keep the field blank.
                continue
            # Scaled by 1.34102 - presumably a kW -> hp conversion.
            item1["other_specs_horse_power"] = power * 1.34102

    name_words = item1["Car_Name"].split(' ')
    item1["Make"] = 'Bentley'
    item1["model"] = ' '.join(name_words[1:-1])
    item1['Spec'] = name_words[-1]
    item1["Country"] = "Oman"
    item1["City"] = "Muscat"
    item1['Last_Code_Update_Date'] = 'Thursday, June 04, 2019'
    item1['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')

    item2['src'] = "muscat.bentleymotors.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "bentley"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item1), sort_keys=True).encode('utf-8')).hexdigest()

    item1['meta'] = dict(item2)
    item1['Source'] = item2['src']
    yield item1
def parse_data(self, response):
    """Build the ``AutodataItem`` for one om.opensooq.com listing page.

    Each ``<li>`` under the ``customP`` block is a label (``span`` text) and
    a value (``a`` text, double quotes removed); the lower-cased label picks
    the field to fill. The price row stores the link text as the currency
    and the ``<strong>`` text as the amount.

    The item is yielded only when a make or model was actually found.
    ``Car_Name`` is ``Make + ' ' + model`` and therefore never empty on its
    own, so the emptiness check is done on the stripped name.

    The ``meta`` checksum is computed while ``Last_Code_Update_Date`` and
    ``Scrapping_Date`` are still blank.
    """
    item2 = MetaItem()
    item1 = AutodataItem()

    # Blank every field first so the record always has the full schema.
    for name in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item1[name] = ""
    item1["Country"] = "Oman"
    item1["Seller_Type"] = "Market Places"
    item1["Seller_Name"] = "Opensooq"
    item1["Car_URL"] = response.url
    item1['vat'] = 'yes'

    for det in response.xpath('//div[@class="customP"]/ul/li'):
        key = ''.join(det.xpath('span/text()').extract()).strip().lower()
        value = ''.join(
            det.xpath('a/text()').extract()).replace('"', '').strip()

        # Branch order matters: the first matching substring wins.
        if 'city' in key:
            item1["City"] = value
        elif 'make' in key:
            item1["Make"] = value
        elif 'model' in key:
            item1["model"] = value
        elif 'year' in key:
            item1["Year"] = value
        elif 'condition' in key:
            item1["condition"] = value
        elif 'kilometers' in key:
            item1["mileage"] = value.split(' ')[0].replace('+', '')
        elif 'transmission' in key:
            item1["transmission"] = value
        elif 'fuel' in key:
            item1["fuel_type"] = value
        elif 'color' in key:
            item1["colour_exterior"] = value
        elif 'price' in key:
            item1["Price_Currency"] = value
            item1['asking_price_inc_VAT'] = ''.join(
                det.xpath('a/strong/text()').extract()).strip()

    item1["Car_Name"] = item1["Make"] + ' ' + item1["model"]

    item2['src'] = "om.opensooq.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "opensooq_om"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item1), sort_keys=True).encode('utf-8')).hexdigest()

    item1['meta'] = dict(item2)
    item1['Last_Code_Update_Date'] = 'Tuesday, June 18, 2019'
    item1['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')
    item1['Source'] = item2['src']

    # A name that is only the separator space means nothing was scraped.
    if item1['Car_Name'].strip():
        yield item1
def parse_data(self, response):
    """Build the ``AutodataItem`` for one qa.fridaymarket.com advert page.

    Labels (``td.viewad-label``) and values (``span.viewad-data``) are
    paired by position; if the page has more of one than the other, the
    unmatched tail is ignored rather than raising. The car name is the
    advert heading passed through ``remove_non_ascii``.

    The item is yielded only when the heading produced a non-empty name.
    """
    item = AutodataItem()
    item2 = MetaItem()

    # Blank every field first so the record always has the full schema.
    for name in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item[name] = ""
    item["Country"] = "Qatar"
    item["Seller_Type"] = "Market Places"
    item["Seller_Name"] = "Friday cars"
    item["Car_URL"] = response.url
    item['Price_Currency'] = 'QAR'
    item['vat'] = 'yes'
    item['Last_Code_Update_Date'] = 'June 22, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')

    labels = response.xpath('//td[@class="viewad-label"]')
    values = response.xpath('//span[@class="viewad-data"]')
    # zip() stops at the shorter list, so a mismatch cannot raise IndexError.
    for label_node, value_node in zip(labels, values):
        label = ''.join(label_node.xpath('text()').extract()).strip()
        value = ''.join(value_node.xpath('text()').extract()).strip()
        if 'Brand' in label:
            item['Make'] = value
        elif 'Model' in label:
            item['model'] = value
        elif 'Year' in label:
            item['Year'] = value
        elif 'Location' in label:
            item['City'] = value
        elif "Price" in label:
            item["asking_price_inc_VAT"] = value.split(' ')[0]

    item["Car_Name"] = remove_non_ascii(''.join(
        response.xpath(
            '//div[@class = "panel-body alone pad0 viewad-topinfo"]/h1/text()'
        ).extract()).strip())

    item2['src'] = "http://qa.fridaymarket.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "fridaymarket"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()

    item['meta'] = dict(item2)
    item['Source'] = item2['src']
    if item['Car_Name'] != '':
        yield item
def parse_dir_contents(self, response):
    """Build the ``AutodataItem`` for one dhofarautomotive.com vehicle page.

    The price is the first ``li/span`` text under the ``omr1`` list. The two
    spec lists are read by position:

    * ``fist_sec``: make, year, exterior colour, transmission
    * ``fist_for``: model, mileage, doors, body style

    A list with fewer entries than expected raises ``IndexError`` and the
    page yields nothing. The ``meta`` checksum is computed before any
    scraped value is stored.
    """
    item = AutodataItem()
    meta = MetaItem()

    # Blank every field first so the record always has the full schema.
    for name in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item[name] = ""
    item["Seller_Type"] = "Large Independent Dealers"
    item["Seller_Name"] = "Dhofar Automotive"
    item["Car_URL"] = response.url
    item['vat'] = 'yes'
    item['Last_Code_Update_Date'] = 'June 6, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')

    meta['src'] = "dhofarautomotive.com"
    meta['ts'] = datetime.utcnow().isoformat()
    meta['name'] = "dhofar"
    meta['url'] = response.url
    meta['uid'] = str(uuid.uuid4())
    snapshot = json.dumps(dict(item), sort_keys=True)
    meta['cs'] = hashlib.md5(snapshot.encode('utf-8')).hexdigest()
    item['meta'] = dict(meta)
    item['Source'] = meta['src']

    prices = response.xpath(
        "//ul[contains(@class, 'omr1')]/li/span/text()").extract()
    item['asking_price_inc_VAT'] = prices[0]
    item['Price_Currency'] = 'OMR'

    first_list = response.xpath(
        "//ul[contains(@class, 'fist_sec')]/li/text()").extract()
    second_list = response.xpath(
        "//ul[contains(@class, 'fist_for')]/li/text()").extract()

    item['Make'] = first_list[0]
    item['model'] = second_list[0]
    item['Car_Name'] = item['Make'] + ' ' + item['model']
    item['Year'] = first_list[1]
    item['colour_exterior'] = first_list[2]
    item['transmission'] = first_list[3]
    item['mileage'] = second_list[1].strip()
    item['mileage_unit'] = 'km'
    item['Doors'] = second_list[2]
    item['bodystyle'] = second_list[3]
    yield item
def parse(self, response):
    """Turn one page of the BMW Saudi Arabia search API into items.

    The response body is JSON with a ``results`` list; each entry's
    ``vehicle`` object supplies the car details, and the entry itself the
    pricing. Values stored under a ``key`` string keep only the part after
    the first dot. ``seatFabric`` and ``numberOfCylinders`` are optional;
    any other missing key raises ``KeyError``.

    After the items, a request for the next page (``start`` advanced by 20)
    is yielded whenever this page returned at least one result.
    """
    payload = json.loads(response.body)
    results = payload['results']
    print(len(results))

    def tail(entry):
        """Return the part of ``entry['key']`` after the first dot."""
        return entry['key'].split('.')[1]

    field_names = (
        "Last_Code_Update_Date", "Scrapping_Date", "trim", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "bodystyle", "Car_Name",
        "Year", "Make", "Doors", "transmission", "model", "Spec",
        "other_specs_gearbox", "other_specs_seats", "other_specs_engine_size",
        "other_specs_horse_power", "colour_exterior", "colour_interior",
        "fuel_type", "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
        "condition", "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "mileage_unit", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id", "wheel_size",
        "top_speed_kph", "cylinders", "acceleration", "torque_Nm",
    )

    for res in results:
        meta = MetaItem()
        item = AutodataItem()

        # Blank every field first so the record always has the full schema.
        for name in field_names:
            item[name] = ""

        vehicle = res['vehicle']
        item["Country"] = "Saudi Arabia"
        item["City"] = vehicle['location']['city']
        item["Seller_Type"] = "Official Dealers"
        item["Seller_Name"] = "Mohamed Yousuf Naghi - BMW"
        body_type = vehicle['bodyType']['pl_PL']
        item["bodystyle"] = body_type
        item["Year"] = str(vehicle['constructionYear'])
        item["Make"] = vehicle['make']
        item["transmission"] = tail(vehicle['transmission'])
        if 'seatFabric' in vehicle:
            item["trim"] = tail(vehicle['seatFabric'])
        # The body-type text is removed from both model and version strings.
        item["model"] = vehicle['model'].replace(body_type, '')
        version_head = vehicle['vehicleVersion'].split('-')[0]
        item["Spec"] = version_head.replace(body_type, '').strip()
        item["other_specs_horse_power"] = vehicle['power_hp']
        item["colour_exterior"] = tail(vehicle['bodyColor'])
        item["colour_interior"] = tail(vehicle['interiorColor'])
        item["fuel_type"] = tail(vehicle['fuel'])
        item["mileage"] = vehicle['mileage_km']
        item['Price_Currency'] = res["retailpricing"]["applicableCurrency"]
        item['asking_price_inc_VAT'] = res['priceInclusiveVAT']
        item['asking_price_ex_VAT'] = res['priceExclusiveVAT']
        item['vat'] = 'yes'
        item['mileage_unit'] = 'km'
        if 'numberOfCylinders' in vehicle:
            item['cylinders'] = vehicle['numberOfCylinders']
        item['Last_Code_Update_Date'] = 'Wednesday, July 03, 2019'
        item['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')
        item["Car_URL"] = response.url
        # No separator is inserted between model and Spec.
        item["Car_Name"] = item["Make"] + ' ' + item["model"] + item["Spec"]

        meta['src'] = "bmw-saudiarabia.com"
        meta['ts'] = datetime.utcnow().isoformat()
        meta['name'] = "bmw_sa"
        meta['url'] = response.url
        meta['uid'] = str(uuid.uuid4())
        snapshot = json.dumps(dict(item), sort_keys=True)
        meta['cs'] = hashlib.md5(snapshot.encode('utf-8')).hexdigest()
        item['meta'] = dict(meta)
        item['Source'] = meta['src']
        yield item

    # The current offset is whatever follows "start=" in this page's URL.
    next_start = int(response.url.split('start=')[1]) + 20
    if results:
        next_url = (
            "https://retailcomponent.salescre8.com/v1/services/"
            "retail-component/publications/search?channelId=www_bmwsa"
            "&childQuery=&locale=en_GB&parentQuery=&profile=bmw&rows=20"
            "&sort=&sortOrder=&start=" + str(next_start))
        yield Request(next_url, callback=self.parse,
                      meta={"url": next_url, "body": payload})
def parse_dir_contents(self, response):
    """Build the ``AutodataItem`` for one astonmartin.com vehicle page.

    Country and city come from the address spans. The price text is split
    on spaces (first word currency, second word amount). Year, make, model
    and body style (last word of the variant) come from the title spans;
    transmission, mileage and colours from the overview cells; engine size
    and power from the first and fourth tech-spec table values.

    Every scraped value after the address uses the first match, so a page
    missing any of them raises ``IndexError`` and yields nothing. The
    ``meta`` checksum is computed before those values are stored and while
    ``Car_URL`` is still blank.
    """
    title_query = ("//div[contains(@class, 'title module align-center')]"
                   "/h3/span[contains(@class, '%s')]/text()")
    cell_query = ("//div[contains(@class, 'cell %s')]"
                  "/span[contains(@class, 'value %s')]/text()")
    price_query = ("//div[contains(@class,'price-now')]"
                   "/span[contains(@class,'value')]/text()")
    tech_query = (
        "//div[contains(@class, 'span6')]"
        "/div[contains(@class,'custom-html module align-center tech-spec')]"
        "/table/tr/td[contains(@class,'value')]/text()")

    def joined(query):
        """Return all matched text nodes joined together and stripped."""
        return ''.join(response.xpath(query).extract()).strip()

    def first(query):
        """Return the first matched text node (IndexError when absent)."""
        return response.xpath(query).extract()[0]

    def title(part):
        return first(title_query % part)

    def cell(kind):
        return first(cell_query % (kind, kind))

    item = AutodataItem()
    meta = MetaItem()

    # Blank every field first so the record always has the full schema.
    for name in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item[name] = ""
    item["Country"] = joined('//span[@class = "address-country"]/text()')
    item["City"] = joined('//span[@class = "address-city"]/text()')
    item['vat'] = 'yes'
    item['Last_Code_Update_Date'] = 'June 6, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')

    meta['src'] = "astonmartin.com"
    meta['ts'] = datetime.utcnow().isoformat()
    meta['name'] = "astonmartin_kw"
    meta['url'] = response.url
    meta['uid'] = str(uuid.uuid4())
    snapshot = json.dumps(dict(item), sort_keys=True)
    meta['cs'] = hashlib.md5(snapshot.encode('utf-8')).hexdigest()
    item['meta'] = dict(meta)
    item['Car_URL'] = response.url
    item['Source'] = meta['src']

    price_words = joined(price_query).split(' ')
    item['asking_price_inc_VAT'] = price_words[1]
    item['Price_Currency'] = price_words[0]

    item['Year'] = title('year').strip()
    item['Make'] = title('make').strip()
    item['model'] = title('model').strip()
    item['transmission'] = cell('transmission').strip()
    item['mileage'] = cell('mileage').split('km')[0].strip()
    item['mileage_unit'] = 'km'

    # Tech-spec values are positional: [0] engine size, [3] power.
    tech_values = response.xpath(tech_query).extract()
    item['other_specs_engine_size'] = tech_values[0].split('Litre')[0].strip()
    item['engine_unit'] = 'Litre'
    item['colour_exterior'] = cell('exterior-colour').strip()
    item['other_specs_horse_power'] = tech_values[3].split("BHP")[0].strip()
    item['colour_interior'] = cell('interior-colour').strip()
    item['bodystyle'] = title('variant').split()[-1]
    item['Car_Name'] = (
        item['Make'] + ' ' + item['model'] + ' ' + item['bodystyle'])
    yield item
def parse(self, response):
    """Turn the Nissan Dubai pre-owned JSON feed into ``AutodataItem``s.

    The response body is a JSON object whose numeric string keys ("0", "1",
    ...) each hold one vehicle record. Records are emitted in ascending
    numeric key order; keys that are not decimal integers are skipped, so
    the loop always terminates even if the feed carries extra keys.

    ``ENGINE_SIZE`` is split on the first space into size and unit (unit
    left blank when absent). ``"A/T"`` is stored as ``"Automatic"``; any
    other transmission text is stored as-is. ``warranty`` becomes ``"yes"``
    when an expiry date is present.

    The ``meta`` checksum is computed while ``Last_Code_Update_Date`` and
    ``Scrapping_Date`` are still blank.
    """
    data = json.loads(response.body)

    field_names = (
        "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats", "other_specs_engine_size",
        "other_specs_horse_power", "colour_exterior", "colour_interior",
        "fuel_type", "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
        "condition", "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "mileage_unit", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
    )

    # Only the numeric keys are vehicle records; visit them in index order.
    record_keys = sorted((k for k in data if k.isdecimal()), key=int)

    for key in record_keys:
        car = data[key]
        item2 = MetaItem()
        item1 = AutodataItem()

        # Blank every field first so the record always has the full schema.
        for name in field_names:
            item1[name] = ""
        item1["mileage_unit"] = 'km'

        item1["Seller_Type"] = car["SALE_TYPE"]
        item1["Seller_Name"] = car["SELLER_NAME"]
        item1["Car_Name"] = car["DESCRIPTION"]
        item1["Year"] = str(car["MODEL_YEAR"])
        item1["Make"] = car["MAKE_DESC"]
        item1["model"] = car["MODEL_DESC"]
        item1["bodystyle"] = car["BODY_TYPE_DESC"]

        engine_words = car["ENGINE_SIZE"].split(' ')
        item1["other_specs_engine_size"] = engine_words[0]
        # A size without a unit must not discard the whole record.
        item1["engine_unit"] = (
            engine_words[1] if len(engine_words) > 1 else '')

        item1["colour_exterior"] = car["COLOR"]
        item1["fuel_type"] = car["FUEL_TYPE_DESC"]
        item1["mileage"] = car["MILEAGE"]
        item1["Spec"] = car["VARIANT"]
        item1["trim"] = car["TRIM_TYPE"]
        item1["colour_interior"] = car["INTERIOR_COLOR"]
        item1["City"] = "Dubai"
        item1["asking_price_inc_VAT"] = car["PRICE"]
        item1["Price_Currency"] = "AED"
        # Dubai is in the UAE; this previously said "Saudi Arabia".
        item1["Country"] = "UAE"
        item1["vat"] = "yes"
        item1["Car_URL"] = "https://en.nissan-dubai.com/certified-preowned-cars/buy-a-car.html"

        gearbox = car["TRNS_TYPE_DESC"]
        item1["transmission"] = "Automatic" if gearbox == "A/T" else gearbox

        item1["warranty_untill_when"] = car["WAR_EXP_DATE"]
        if item1["warranty_untill_when"] != "":
            item1["warranty"] = "yes"

        item2['src'] = "en.nissan-dubai.com"
        item2['ts'] = datetime.utcnow().isoformat()
        item2['name'] = "nissan"
        item2['url'] = response.url
        item2['uid'] = str(uuid.uuid4())
        item2['cs'] = hashlib.md5(
            json.dumps(dict(item1), sort_keys=True).encode('utf-8')
        ).hexdigest()

        item1['meta'] = dict(item2)
        item1['Source'] = item2['src']
        # Dates are set after the checksum so they do not influence it.
        item1['Last_Code_Update_Date'] = 'June 7, 2019'
        item1['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')
        yield item1
def parse_data(self, response):
    """Scrape one vehicle detail page into a single ``AutodataItem``.

    Values are read from the ``listing_category_*`` table rows and from the
    two page headings (car name and price); everything else is a fixed
    constant or left blank.  A ``MetaItem`` snapshot with an MD5 checksum of
    the item is attached before the item is yielded.

    Args:
        response: detail-page response queried with XPath.

    Yields:
        AutodataItem: the populated item for this page.
    """
    def joined_text(query):
        # All text nodes matched by *query*, concatenated and trimmed.
        return "".join(response.xpath(query).extract()).strip()

    def listing_cell(category):
        # Second cell of the table row for the given listing category.
        return joined_text(
            '//tr[@class="listing_category_%s"]/td[2]/text()' % category)

    field_order = (
        "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats",
        "other_specs_engine_size", "other_specs_horse_power",
        "colour_exterior", "colour_interior", "fuel_type",
        "import_yes_no_also_referred_to_as_GCC_spec", "mileage", "condition",
        "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "mileage_unit", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
    )

    scraped = {
        "Country": "Oman",
        "City": listing_cell("location").split(',')[0],
        "Seller_Type": "Large Independent Dealers",
        "Seller_Name": "Alfarooq Automotive",
        "Car_URL": response.url,
        "Car_Name": joined_text(
            '//div[@class="col-lg-9 col-md-9 col-sm-9 col-xs-12 xs-padding-none"]/h2/text()'),
        "Year": listing_cell("year"),
        "Make": listing_cell("make"),
        "model": listing_cell("model"),
        # Only the last word of the transmission cell is kept.
        "transmission": listing_cell("transmission").split(' ')[-1],
        "bodystyle": listing_cell("body-style"),
        # Everything before the first 'L' of the engine cell.
        "other_specs_engine_size": listing_cell("engine").split('L')[0],
        "colour_exterior": listing_cell("exterior-color"),
        "colour_interior": listing_cell("interior-color"),
        "mileage": listing_cell("mileage"),
        "condition": listing_cell("condition"),
        "Price_Currency": 'OMR',
        "asking_price_inc_VAT": joined_text(
            '//div[@class="col-lg-3 col-md-3 col-sm-3 text-right xs-padding-none"]/h2/text()'),
        "vat": 'yes',
        "mileage_unit": 'km',
        "engine_unit": 'L',
    }

    car = AutodataItem()
    for field in field_order:
        car[field] = scraped.get(field, "")

    snapshot = MetaItem()
    snapshot['src'] = "alfarooqautomotive.com"
    snapshot['ts'] = datetime.utcnow().isoformat()
    snapshot['name'] = "alfarooq"
    snapshot['url'] = response.url
    snapshot['uid'] = str(uuid.uuid4())
    # The two date fields are still blank when the checksum is taken.
    serialised = json.dumps(dict(car), sort_keys=True).encode('utf-8')
    snapshot['cs'] = hashlib.md5(serialised).hexdigest()

    car['meta'] = dict(snapshot)
    car['Last_Code_Update_Date'] = 'Tuesday, June 18, 2019'
    car['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')
    car['Source'] = snapshot['src']
    yield car
def parse_data(self, response):
    """Yield one ``AutodataItem`` per row of the wrapped JSON search result.

    The body carries a JSON object surrounded by extra text (the previous
    code skipped everything before the first ``{`` and dropped a short
    trailer).  The object is cut out between its first ``{`` and last ``}``
    and decoded; the vehicles are read from ``result.rows``.

    The body is now decoded as UTF-8 text instead of going through the
    ``bytes`` repr, so non-ASCII characters, backslash escapes and
    apostrophes inside values survive intact.

    Args:
        response: response whose ``body`` holds the wrapped JSON and whose
            ``url`` is stored on every item.

    Yields:
        AutodataItem: one populated item per entry of ``result.rows``.

    Raises:
        ValueError: if the body contains no ``{...}`` span or the span is
            not decodable as JSON.
    """
    text = response.body.decode('utf-8')
    payload = text[text.index('{'):text.rindex('}') + 1]
    try:
        feed = json.loads(payload)
    except ValueError:
        # Fallback kept from the previous implementation, which always
        # swapped single quotes for double quotes before decoding.
        feed = json.loads(payload.replace("'", '"'))

    blank_fields = (
        "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats",
        "other_specs_engine_size", "other_specs_horse_power",
        "colour_exterior", "colour_interior", "fuel_type",
        "import_yes_no_also_referred_to_as_GCC_spec", "mileage", "condition",
        "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
    )

    for row in feed['result']['rows']:
        item = AutodataItem()
        for field in blank_fields:
            item[field] = ""

        item['model'] = row['modelName']
        item['Year'] = str(row['modelYear'])

        # "<distance> <unit>"; a value without a unit no longer aborts the
        # whole feed.
        mileage_parts = row['mileage'].split(' ')
        item['mileage'] = mileage_parts[0]
        item['mileage_unit'] = (
            mileage_parts[1] if len(mileage_parts) > 1 else '')

        # Leading number of maxPowerKw, multiplied by 1.34102.  Left blank
        # when that leading token is not an integer.
        try:
            item['other_specs_horse_power'] = int(
                row['maxPowerKw'].split(' ')[0]) * 1.34102
        except ValueError:
            item['other_specs_horse_power'] = ""

        dealer = row['dealer']
        item['Seller_Name'] = dealer['name']
        item['Country'] = dealer['country']
        item['City'] = dealer['city']
        item['asking_price_inc_VAT'] = row['price']
        # The first token of the formatted price is used as the currency.
        item['Price_Currency'] = row['formattedPrice'].split(' ')[0]
        item['bodystyle'] = row['bodyStyle']
        item["colour_exterior"] = row['exterior']
        item["colour_interior"] = row['interior']
        item['Last_Code_Update_Date'] = 'June 6, 2019'
        item['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')
        item['Make'] = 'Maserati'
        item['vat'] = 'yes'
        item["Car_URL"] = response.url
        item["Car_Name"] = "Maserati" + ' ' + item['model']

        meta = MetaItem()
        meta['src'] = "maserati.com"
        meta['ts'] = datetime.utcnow().isoformat()
        meta['name'] = "maserati"
        meta['url'] = response.url
        meta['uid'] = str(uuid.uuid4())
        meta['cs'] = hashlib.md5(
            json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
        item['meta'] = dict(meta)
        item['Source'] = meta['src']
        yield item
def parse_data(self, response):
    """Scrape one Audi vehicle page, yielding an item only for ``QA`` dealers.

    The dealership address is split on commas; its last part (minus the
    first character) is the country code.  Pages whose code is not ``QA``
    produce nothing.  For the rest, title, price, the spec list and the
    dealership block are mapped onto an ``AutodataItem`` that is yielded
    with a ``MetaItem`` snapshot.

    NOTE(review): the source had lost its indentation; this assumes the
    whole remainder of the callback, including the ``yield``, belonged to
    the ``QA`` branch.

    Args:
        response: vehicle detail page response.

    Yields:
        AutodataItem: the populated item, only when the dealer is in ``QA``.
    """
    page = Selector(response)

    dealership = page.xpath('//div[@class="dealership"]')
    address = ''.join(
        dealership.xpath('div[@class="address"]/p/text()').extract()).strip()
    address_parts = address.split(',')
    country_code = address_parts[len(address_parts) - 1][1:]
    if country_code != 'QA':
        return

    # 'Country' stays the first key of the item, as it was assigned before
    # every other field.
    blank_fields = (
        "Country", "Last_Code_Update_Date", "Scrapping_Date", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats",
        "other_specs_engine_size", "other_specs_horse_power",
        "colour_exterior", "colour_interior", "fuel_type",
        "import_yes_no_also_referred_to_as_GCC_spec", "mileage", "condition",
        "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
    )
    car = AutodataItem()
    for field in blank_fields:
        car[field] = ""
    car['vat'] = 'yes'
    car['Car_URL'] = response.url
    car['Make'] = "Audi"

    cta = page.xpath('//div[@id="vehicle_cta"]/div[1]')
    car['Car_Name'] = ''.join(
        cta.xpath('div[@class="title"]/text()').extract()).strip()
    car['model'] = car['Car_Name'].split(' ')[0]

    # First three characters are the currency, the amount starts at the
    # fifth character.
    price_text = ''.join(
        cta.xpath('div[@class="price"]/div[@class="cashprice"]/text()')
        .extract()).strip()
    car['Price_Currency'] = price_text[:3]
    car['asking_price_inc_VAT'] = price_text[4:]

    for spec in page.xpath('//div[@class="specs"]/div[@class="item"]'):
        label = ''.join(
            spec.xpath('div[@class="spec_item"]/text()').extract()
        ).strip().lower()
        value = ''.join(
            spec.xpath('div[@class="spec_data"]/text()').extract()).strip()

        if 'year' in label:
            car['Year'] = value
        elif 'colour' in label:
            car['colour_exterior'] = value
        elif 'transmission' in label:
            car['transmission'] = value
        elif 'engine size' in label:
            # The last two characters are taken as the unit.
            car['other_specs_engine_size'] = value[:-2]
            car['engine_unit'] = value[-2:]
        elif 'mileage' in label:
            car['mileage'] = value[:-2]
            car['mileage_unit'] = value[-2:]
        elif 'fuel type' in label:
            car['fuel_type'] = value
        elif 'bhp' in label:
            car['other_specs_horse_power'] = value[:3]

    car['Seller_Name'] = ''.join(
        dealership.xpath('div[@class="wrapper"]/h2/text()').extract()).strip()
    car['City'] = address_parts[len(address_parts) - 2]
    car['Country'] = country_code

    snapshot = MetaItem()
    snapshot['src'] = "audiapproved.com"
    snapshot['ts'] = datetime.utcnow().isoformat()
    snapshot['name'] = "audi"
    snapshot['url'] = response.url
    snapshot['uid'] = str(uuid.uuid4())
    # The two date fields are still blank when the checksum is taken.
    serialised = json.dumps(dict(car), sort_keys=True).encode('utf-8')
    snapshot['cs'] = hashlib.md5(serialised).hexdigest()

    car['meta'] = dict(snapshot)
    car['Last_Code_Update_Date'] = 'June 6, 2019'
    car['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')
    car['Source'] = snapshot['src']
    yield car
def parse_dir_contents(self, response):
    """Scrape one Zayani Motors offer page into an ``AutodataItem``.

    The page heading is the car name.  The make/model are derived from it,
    year, mileage and fuel type come from the first three ``offer_data``
    list entries, and the price from the ``offer_price`` block.  A page
    whose heading is exactly ``"Ramadan Deals"`` produces nothing.

    NOTE(review): the source had lost its indentation; this assumes the
    ``yield`` belonged to the non-"Ramadan Deals" branch.

    Args:
        response: offer page response.

    Yields:
        AutodataItem: the populated item for a real car offer.

    Raises:
        IndexError: if the heading, the price or one of the first three
            ``offer_data`` entries is missing from the page.
    """
    heading = response.xpath(
        "//div[contains(@class,'col_1_01')]/h1/text()").extract()[0]
    if heading == "Ramadan Deals":
        return

    blank_fields = (
        "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats",
        "other_specs_engine_size", "other_specs_horse_power",
        "colour_exterior", "colour_interior", "fuel_type",
        "import_yes_no_also_referred_to_as_GCC_spec", "mileage", "condition",
        "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "mileage_unit", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
    )
    car = AutodataItem()
    for field in blank_fields:
        car[field] = ""

    car['vat'] = 'yes'
    car['Last_Code_Update_Date'] = "17 June 2019"
    car['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')
    car['Country'] = 'Bahrain'
    car['City'] = 'Manama'
    car['Seller_Type'] = 'Large Independent Dealers'
    car['Seller_Name'] = 'Zayani Motors'
    car['Car_URL'] = response.url
    car['Car_Name'] = heading

    # Brand-prefixed names: the brand is the first word, the model is the
    # name with every occurrence of the brand removed.
    brand = heading.split()[0]
    if brand == "MG":
        car['Make'] = 'MG'
        car['model'] = heading.replace("MG", '').strip()
    if brand == "Lexus":
        car['Make'] = "Lexus"
        car['model'] = heading.replace("Lexus", '').strip()

    # Any of these model names anywhere in the car name marks the car as a
    # Mitsubishi; the last matching name wins.
    for known_model in ["Lancer", "L200", "Attrage", "Pajero", "Outlander",
                        "Montero", "Mast"]:
        if known_model in car['Car_Name']:
            car['Make'] = "Mitsubishi"
            car['model'] = known_model

    facts = response.xpath(
        "//div[contains(@class,'offer_data')]/ul/li/text()").extract()
    car['Year'] = facts[0]
    car['asking_price_inc_VAT'] = response.xpath(
        "//div[contains(@class,'offer_price')]/strong/text()"
    ).extract()[0].split()[0]
    car['Price_Currency'] = 'BHD'

    # When the entry starts with "KM" no figure precedes the unit.
    distance = facts[1].split()[0]
    car['mileage'] = '' if distance == "KM" else distance
    car['mileage_unit'] = 'km'
    car['fuel_type'] = facts[2]

    snapshot = MetaItem()
    snapshot['src'] = "zmotors.com"
    snapshot['ts'] = datetime.utcnow().isoformat()
    snapshot['name'] = "zmotors"
    snapshot['url'] = response.url
    snapshot['uid'] = str(uuid.uuid4())
    serialised = json.dumps(dict(car), sort_keys=True).encode('utf-8')
    snapshot['cs'] = hashlib.md5(serialised).hexdigest()

    car['meta'] = dict(snapshot)
    car['Source'] = snapshot['src']
    yield car
def parse_dir_contents(self, response):
    """Scrape one Cadillac Mannai Autos vehicle page into an ``AutodataItem``.

    Year, make, model and variant are read from the overview title spans,
    the remaining specs from the ``cell <name>`` / ``value <name>`` blocks
    and the price from the ``price-now`` block.

    Two changes from the previous version:

    * A spec that is absent from the page now leaves its field blank
      instead of raising ``IndexError`` and dropping the whole car.
    * The metadata checksum is computed after the scraped fields are
      filled in, so it covers the car data and not only the blank template.

    Args:
        response: vehicle detail page response.

    Yields:
        AutodataItem: the populated item for this page.
    """
    def first_text(query):
        # First text node matched by *query*, trimmed; '' when none match.
        matches = response.xpath(query).extract()
        return matches[0].strip() if matches else ''

    def title_part(css_class):
        # One span of the overview title (year / make / model / variant).
        return first_text(
            "//div[contains(@class, 'title module align-center overview-title')]"
            "/h3/span[contains(@class, '%s')]/text()" % css_class)

    def spec_cell(name):
        # Value span of the spec cell carrying the given class suffix.
        return first_text(
            "//div[contains(@class, 'cell %s')]"
            "/span[contains(@class, 'value %s')]/text()" % (name, name))

    blank_fields = (
        "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats",
        "other_specs_engine_size", "other_specs_horse_power",
        "colour_exterior", "colour_interior", "fuel_type",
        "import_yes_no_also_referred_to_as_GCC_spec", "mileage", "condition",
        "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "mileage_unit", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
    )
    item = AutodataItem()
    for field in blank_fields:
        item[field] = ""

    item["Country"] = "Qatar"
    item["Seller_Type"] = "Large Independent Dealers"
    item["Seller_Name"] = "Cadillac Mannai Autos"
    item["Car_URL"] = response.url
    item['vat'] = 'yes'
    item['Last_Code_Update_Date'] = 'June 6, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')

    # Keep whatever follows the last 'QAR' marker in the price text.
    item['asking_price_inc_VAT'] = first_text(
        "//div[contains(@class,'price-now')]/span[contains(@class,'value')]/text()"
    ).split('QAR')[-1].strip()
    item['Price_Currency'] = 'QAR'

    item['Year'] = title_part('year')
    item['Make'] = title_part('make')
    item['model'] = title_part('model')
    item['Spec'] = title_part('variant')
    item['Car_Name'] = item['Make'] + ' ' + item['model'] + ' ' + item['Spec']

    item['transmission'] = spec_cell('transmission')
    item['bodystyle'] = spec_cell('bodystyle')
    # Text before the word 'miles'.
    item['mileage'] = spec_cell('mileage').split('miles')[0].strip()
    item['mileage_unit'] = 'miles'
    # Text before the first 'l' of the engine-size cell.
    item['other_specs_engine_size'] = (
        spec_cell('engine-size').split('l')[0].strip())
    item['engine_unit'] = 'litre'
    item['colour_exterior'] = spec_cell('colour')
    item['fuel_type'] = spec_cell('fuel-type')
    item['Doors'] = spec_cell('doors')
    item['colour_interior'] = spec_cell('interior-colour')
    item['other_specs_seats'] = spec_cell('num-seats')

    meta = MetaItem()
    meta['src'] = "cadillac.mannaiautos.com"
    meta['ts'] = datetime.utcnow().isoformat()
    meta['name'] = "cadillac_mannaiautos"
    meta['url'] = response.url
    meta['uid'] = str(uuid.uuid4())
    meta['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(meta)
    item['Source'] = meta['src']

    yield item
def parse_dir_contents(self, response):
    """Scrape one Das WeltAuto vehicle page, yielding only Oman dealers' cars.

    The dealership heading decides whether the page is processed: it must
    contain ``" Oman"``.  Specs are read by position from the list of
    ``spec_data`` texts (year, colour, transmission, engine size, mileage,
    fuel type, acceleration, horse power, in that order).

    Changes from the previous version:

    * A page without a dealership heading is skipped instead of raising
      ``TypeError`` on ``None``.
    * The metadata checksum is computed after the scraped fields are
      filled in, so it covers the car data.
    * The spec list is fetched once instead of once per field.
    * ``Seller_Type`` uses the same capitalisation as the other callbacks.

    NOTE(review): the source had lost its indentation; this assumes the
    ``yield`` belonged to the Oman branch.

    Args:
        response: vehicle detail page response.

    Yields:
        AutodataItem: the populated item, only for Oman dealers.

    Raises:
        IndexError: if the title, the price or one of the eight expected
            spec entries is missing (positional parsing would otherwise
            put values into the wrong fields).
    """
    dealer = response.xpath(
        "//div[contains(@class,'dealership')]/div[contains(@class,'wrapper')]/h2/text()"
    ).get()
    if dealer is None or " Oman" not in dealer:
        return

    blank_fields = (
        "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats",
        "other_specs_engine_size", "other_specs_horse_power",
        "colour_exterior", "colour_interior", "fuel_type",
        "import_yes_no_also_referred_to_as_GCC_spec", "mileage", "condition",
        "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "mileage_unit", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
        "wheel_size", "top_speed_kph", "cylinders", "acceleration",
        "torque_Nm",
    )
    item = AutodataItem()
    for field in blank_fields:
        item[field] = ""

    item['Last_Code_Update_Date'] = 'Thursday, June 04, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')
    item['Car_URL'] = response.url
    item['Country'] = 'Oman'
    item['City'] = 'Muscat'
    item['Seller_Name'] = 'Wattayah Motors Oman VW'
    item['Seller_Type'] = 'Large Independent Dealers'

    item['Make'] = 'Volkswagen'
    # The model is the first word of the vehicle title.
    item['model'] = response.xpath(
        "//div[contains(@id,'vehicle_cta')]/div/div[contains(@class,'title')]/text()"
    ).extract()[0].split()[0].strip()
    item['Car_Name'] = item['Make'] + ' ' + item['model']

    item['asking_price_inc_VAT'] = response.xpath(
        "//div[contains(@class,'price')]/div[contains(@class,'cashprice')]/text()"
    ).extract()[0].split('USD')[-1].strip()
    item['Price_Currency'] = 'USD'

    specs = response.xpath(
        "//div[contains(@class,'item')]/div[contains(@class,'spec_data')]/text()"
    ).extract()
    item['Year'] = specs[0]
    item['colour_exterior'] = specs[1]
    item['transmission'] = specs[2].strip()
    item['other_specs_engine_size'] = specs[3].split('cc')[0].strip()
    item['engine_unit'] = 'cc'
    item['mileage'] = specs[4].split('km')[0].strip()
    item['mileage_unit'] = 'km'
    item['fuel_type'] = specs[5]
    item['acceleration'] = specs[6].split(' ')[0].strip()
    item['other_specs_horse_power'] = specs[7].split('bhp')[0].strip()

    meta = MetaItem()
    meta['src'] = "dasweltauto.me"
    meta['ts'] = datetime.utcnow().isoformat()
    meta['name'] = "dasweltauto_oman"
    meta['url'] = response.url
    meta['uid'] = str(uuid.uuid4())
    meta['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(meta)
    item['Source'] = meta['src']

    yield item
def parse_data(self, response):
    """Scrape one GMC (Universal Motor Agencies) vehicle page into an item.

    Make, model, variant and year come from the title spans, the other
    specs from the ``cell <name>`` / ``value <name>`` blocks, and the price
    from the text after ``SAR`` in the price span.  ``warranty_untill_when``
    is set to the scrape date plus 24 months.

    Changes from the previous version:

    * The metadata checksum is computed after the scraped fields are
      filled in, so it covers the car data.
    * Engine size or mileage without a unit, or a price without ``SAR``,
      leaves the affected field blank instead of raising ``IndexError``.
    * The price is stripped of surrounding whitespace.
    * The ``reg-year`` lookup is gone: its result was always overwritten
      by the title year.

    Args:
        response: vehicle detail page response.

    Yields:
        AutodataItem: the populated item for this page.
    """
    page = Selector(response)

    def joined_text(query):
        # All text nodes matched by *query*, concatenated and trimmed.
        return ''.join(page.xpath(query).extract()).strip()

    def spec_cell(name):
        return joined_text(
            '//div[@class="cell %s"]/span[@class="value %s"]/text()'
            % (name, name))

    def title_part(css_class):
        return joined_text(
            '//div[@class="title module"]/h3/span[@class="%s"]//text()'
            % css_class)

    def value_and_unit(text):
        # Split "<value> <unit>" on single spaces; the unit is '' if absent.
        parts = text.split(' ')
        return parts[0], (parts[1] if len(parts) > 1 else '')

    blank_fields = (
        "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats",
        "other_specs_engine_size", "other_specs_horse_power",
        "colour_exterior", "colour_interior", "fuel_type",
        "import_yes_no_also_referred_to_as_GCC_spec", "mileage", "condition",
        "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "mileage_unit", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
    )
    item = AutodataItem()
    for field in blank_fields:
        item[field] = ""

    item["Country"] = "KSA"
    item["Seller_Type"] = "Official Dealers"
    item["Seller_Name"] = "Universal Motor Agencies"
    item['warranty'] = 'yes'
    item['vat'] = 'yes'
    item['Last_Code_Update_Date'] = 'Thursday, June 04, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')
    item['Car_URL'] = response.url

    engine_size, engine_unit = value_and_unit(spec_cell('engine-size'))
    item['other_specs_engine_size'] = engine_size
    item['engine_unit'] = engine_unit

    mileage, mileage_unit = value_and_unit(spec_cell('mileage'))
    item['mileage'] = mileage
    item['mileage_unit'] = mileage_unit

    item['colour_exterior'] = spec_cell('colour')
    item['bodystyle'] = spec_cell('bodystyle')
    item['fuel_type'] = spec_cell('fuel-type')
    item['colour_interior'] = spec_cell('interior-colour')
    item['transmission'] = spec_cell('transmission')
    item['Doors'] = spec_cell('doors')

    item['model'] = title_part('model')
    item['Year'] = title_part('year')
    item['Make'] = title_part('make')
    item["Spec"] = title_part('variant')
    item['Car_Name'] = (item['Make'] + ' ' + item['model'] + ' '
                        + item["Spec"] + ' ' + item['Year']).strip()

    # The amount is the text between the first and second 'SAR' marker.
    price_parts = joined_text(
        '//*[@id="content-wrap"]/div[2]/div/div[1]/div/div[2]/div/div/span[2]/text()'
    ).split('SAR')
    if len(price_parts) > 1:
        item['asking_price_inc_VAT'] = price_parts[1].strip()
    item['Price_Currency'] = 'SAR'

    item['warranty_untill_when'] = (
        datetime.today() + relativedelta(months=+24)).strftime('%Y-%m-%d')

    meta = MetaItem()
    meta['src'] = "gmc.uma.com.sa"
    meta['ts'] = datetime.utcnow().isoformat()
    meta['name'] = "gmc_uma"
    meta['url'] = response.url
    meta['uid'] = str(uuid.uuid4())
    meta['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(meta)
    item['Source'] = meta['src']

    yield item
def getdata(self, response):
    """Scrape one Jaguar approved-used vehicle page into an ``AutodataItem``.

    The spec table is read as parallel lists of header (``th``) and value
    (``td``) texts; each header is matched by substring to decide which
    field its value fills.  The car name is the page title with the engine
    and bodystyle cell texts (upper-cased) removed, prefixed with the make.

    Changes from the previous version:

    * The metadata checksum is computed after the scraped fields are
      filled in, so it covers the car data.
    * The title is only stripped of the engine / bodystyle text when that
      row exists.  Before, a missing row silently fell back to stripping
      the first table cell.
    * Headers without a matching value cell, a one-word engine cell and a
      missing price no longer raise; the affected field stays blank.
    * The unused first-word-of-title lookup and the debug print are gone.

    Args:
        response: vehicle detail page response.

    Yields:
        AutodataItem: the populated item for this page.
    """
    blank_fields = (
        "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
        "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year", "Make",
        "model", "Spec", "Doors", "transmission", "trim", "bodystyle",
        "other_specs_gearbox", "other_specs_seats",
        "other_specs_engine_size", "other_specs_horse_power",
        "colour_exterior", "colour_interior", "fuel_type",
        "import_yes_no_also_referred_to_as_GCC_spec", "mileage", "condition",
        "warranty_untill_when", "service_contract_untill_when",
        "Price_Currency", "asking_price_inc_VAT", "asking_price_ex_VAT",
        "warranty", "service_contract", "vat", "mileage_unit", "engine_unit",
        "autodata_Make", "autodata_Make_id", "autodata_model",
        "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
        "autodata_transmission", "autodata_transmission_id",
        "autodata_bodystyle", "autodata_bodystyle_id",
    )
    item = AutodataItem()
    for field in blank_fields:
        item[field] = ""

    item["Country"] = "Saudi Arabia"
    item["Seller_Type"] = "Large Independent Dealers"
    item["Seller_Name"] = "MOHAMED YOUSUF NAGHI MOTORS"
    item["Car_URL"] = response.url
    item['warranty'] = 'yes'
    item['vat'] = 'yes'
    item['Last_Code_Update_Date'] = 'Thursday, June 07, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')

    values = response.xpath("//tr/td/text()").extract()
    headers = response.xpath("//tr/th/text()").extract()

    engine_row = None   # position of the 'Engine' row, if the page has one
    body_row = None     # position of the 'Bodystyle' row, if the page has one

    # zip() stops at the shorter list, so a header without a value cell is
    # skipped rather than indexed out of range.
    for position, (header, value) in enumerate(zip(headers, values)):
        if 'Model Year' in header:
            item["Year"] = value
        elif 'Exterior' in header:
            item["colour_exterior"] = value
        elif 'Interior' in header:
            item["colour_interior"] = value
        elif 'Kilometers' in header:
            item['mileage'] = value.split(' ')[0]
            item['mileage_unit'] = value.split(' ')[-1]
        elif 'Transmission' in header:
            item["transmission"] = value.split(' ')[-1]
        elif 'Bodystyle' in header:
            item["bodystyle"] = value.split(' ')[-1]
            body_row = position
        elif 'Engine' in header:
            engine_parts = value.split(' ')
            item["other_specs_engine_size"] = engine_parts[0]
            item['cylinders'] = (
                engine_parts[1] if len(engine_parts) > 1 else '')
            item['engine_unit'] = 'l'
            engine_row = position
        elif 'Fuel Type' in header:
            item["fuel_type"] = value
        elif 'Location' in header:
            item["City"] = value

    item["Make"] = "Jaguar"
    title = ''.join(
        response.xpath('//hgroup/h1[@class="section-title"]/text()').extract())
    if engine_row is not None:
        title = title.replace(values[engine_row].upper() + ' ', '')
    if body_row is not None:
        title = title.replace(values[body_row].upper(), '')
    item["Car_Name"] = item["Make"] + ' ' + title.strip()

    item['Price_Currency'] = 'SAR'
    price_text = response.xpath(
        "//strong[contains(@class,'price-box')]/text()").get()
    if price_text is not None:
        item['asking_price_inc_VAT'] = price_text.split('SAR')[-1].strip()

    meta = MetaItem()
    meta['src'] = "approved.me.jaguar.com"
    meta['ts'] = datetime.utcnow().isoformat()
    meta['name'] = "jaguar_sa"
    meta['url'] = response.url
    meta['uid'] = str(uuid.uuid4())
    meta['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(meta)
    item['Source'] = meta['src']

    yield item
def parse_dir_contents(self, response):
    """Scrape one pre-owned Mercedes-Benz (Oman) page into an ``AutodataItem``.

    Values are read by position from the "left" and "right" ``<ul>`` lists,
    from the tab container and from the price heading.  A ``MetaItem``
    snapshot is stored under ``item['meta']``; its ``cs`` checksum is taken
    after the listing fields are filled in, so it reflects the scraped data.

    Args:
        response: Downloaded vehicle detail page.

    Yields:
        One populated ``AutodataItem``.

    Raises:
        IndexError: If the heading, the price, or one of the mandatory
            list positions (left 1-4, right 0-2) is missing from the page.
    """
    item = AutodataItem()
    # Start every exported column blank so each item carries the same keys.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id", "wheel_size",
            "top_speed_kph", "cylinders", "acceleration", "torque_Nm"):
        item[field] = ""

    # Values that are fixed for this source.
    item["Car_URL"] = response.url
    item['vat'] = 'yes'
    item['Make'] = 'Mercedes-Benz'
    item['mileage_unit'] = 'km'
    item['Price_Currency'] = 'OMR'
    item['Last_Code_Update_Date'] = 'Thursday, June 04, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')

    def after_colon(text):
        """Return the stripped text that follows the last ':' in ``text``."""
        return text.split(':')[-1].strip()

    def tab_text(position):
        """Join the text nodes of the ``position``-th <li> of the second tab."""
        return ''.join(
            response.xpath(
                '//div[@class="resp-tabs-container"]/div[2]/ul/li[%d]/text()'
                % position).extract())

    # Each list is extracted once and then addressed by position.
    left = response.xpath("//ul[contains(@class, 'left')]/li/text()").extract()
    right = response.xpath(
        "//ul[contains(@class, 'right')]/li/text()").extract()

    item['Year'] = after_colon(left[1])
    item['mileage'] = after_colon(left[2])
    item['colour_exterior'] = after_colon(left[3])
    item['bodystyle'] = after_colon(left[4])
    if '/' in item['bodystyle']:
        item['bodystyle'] = 'Coupe/Cabriolet'
    # The sixth entry is not guaranteed to exist; leave the field blank
    # rather than fail the whole item.
    if len(left) > 5:
        item['transmission'] = after_colon(left[5])

    # NOTE(review): the first right-hand entry is written to Country, which
    # replaces any fixed country value; the same position may actually hold
    # a city/location -- confirm against the live page.
    item['Country'] = after_colon(right[0])
    item['Seller_Name'] = after_colon(right[1])
    item['trim'] = after_colon(right[2])

    heading_words = response.xpath(
        "//div[contains(@class, 'col_8 content car-detail')]/h2/text()"
    ).extract()[0].split()
    # The model is the first two heading words glued together.
    item['model'] = ''.join(heading_words[:2])
    item['Car_Name'] = item['Make'] + ' ' + item['model']

    item['warranty'] = tab_text(1).split('Warranty : ')[-1].strip().split(
        ',')[0]
    item['service_contract'] = tab_text(2).strip().split(' ')[-1]
    item['wheel_size'] = tab_text(4).strip().split(' ')[-1]

    item['asking_price_inc_VAT'] = response.xpath(
        "//div[contains(@class,'price')]/h3/text()").extract()[0].split(
            'OMR')[-1].strip()

    meta = MetaItem()
    meta['src'] = "oman.pe-mb.com"
    meta['ts'] = datetime.utcnow().isoformat()
    meta['name'] = "omanpe"
    meta['url'] = response.url
    meta['uid'] = str(uuid.uuid4())
    # Hash the populated listing (before 'meta' and 'Source' are attached).
    meta['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(meta)
    item['Source'] = meta['src']
    yield item
def parse_data(self, response):
    """Scrape one BMW Dubai vehicle page into an ``AutodataItem``.

    The heading and price are read from the page top; the remaining fields
    come from the name/value pairs of the vehicle-details panel.  A
    ``MetaItem`` snapshot with a checksum of the populated item is stored
    under ``item['meta']``.

    Args:
        response: Downloaded vehicle detail page.

    Yields:
        One populated ``AutodataItem``.

    Raises:
        ValueError: If the "warranty in months" or "cylinder capacity"
            value does not start with an integer.
    """
    item1 = AutodataItem()
    # Start every exported column blank so each item carries the same keys.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id", "wheel_size",
            "top_speed_kph", "cylinders", "acceleration", "torque_Nm"):
        item1[field] = ""

    sel = Selector(response)

    # Values that are fixed for this source.
    item1['vat'] = 'yes'
    item1['Last_Code_Update_Date'] = 'Thursday, June 04, 2019'
    item1['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')
    item1["Car_URL"] = response.url
    item1["Country"] = "UAE"
    item1["City"] = "Dubai"
    item1["Make"] = "BMW"
    item1["Car_Name"] = "BMW " + ''.join(
        sel.xpath(
            '//div[@class="col-flex pdp-details-top-title"]/h2/text()').
        extract()).strip()

    # The price text is "<currency> <amount>", possibly separated by a
    # non-breaking space.  An absent amount leaves the price blank.
    price_tokens = ''.join(
        sel.xpath(
            '//div[@class="pdp-details-top-info-price"]/h3/span//text()').
        extract()).strip().replace(u'\xa0', u' ').split(' ')
    item1["Price_Currency"] = price_tokens[0]
    if len(price_tokens) > 1:
        item1["asking_price_inc_VAT"] = price_tokens[1]

    details_panels = sel.xpath(
        '//div[@class="vehicle-details-panel"]/div[@class="vehicle-details-panel-feature"]'
    )
    for det in details_panels:
        name = ''.join(det.xpath('div[1]//text()').extract()).strip()
        value = ''.join(det.xpath('div[2]//text()').extract()).strip()
        label = name.lower()
        # First space-separated token of the value, with non-breaking
        # spaces normalised; used by the numeric fields below.
        tokens = value.replace(u'\xa0', u' ').split(' ')
        if label == "model year":
            item1['Year'] = value
        elif label == "transmission":
            item1['transmission'] = value
        elif label == "basic paintwork":
            item1['colour_exterior'] = value
        elif label == "number of doors":
            item1['Doors'] = value
        elif label == "mileage":
            item1['mileage'] = tokens[0]
            # A mileage without a unit token keeps the unit blank.
            if len(tokens) > 1:
                item1['mileage_unit'] = tokens[1]
        elif label == "model":
            item1['model'] = value
        elif label == "body type":
            item1['bodystyle'] = value.replace('Coupé', 'Coupe')
        elif label == "fuel type":
            item1['fuel_type'] = value
        elif label == "engine power(hp)":
            item1['other_specs_horse_power'] = tokens[0]
        elif label == "number of seats":
            item1['other_specs_seats'] = value
        elif label == "upholstery type":
            item1['trim'] = value
        elif label == "warranty in months":
            # Stored as an end date counted from the scrape day.
            item1['warranty_untill_when'] = (
                datetime.today() +
                relativedelta(months=+int(value))).strftime('%Y-%m-%d')
        elif name == "Number of Cylinders":
            # This label is matched case-sensitively, unlike the others.
            item1['cylinders'] = value
        elif label == "cylinder capacity":
            item1['other_specs_engine_size'] = int(tokens[0])
            item1['engine_unit'] = 'cc'

    if item1['warranty_untill_when'] != "":
        item1['warranty'] = "yes"

    item1["Seller_Name"] = ''.join(
        sel.xpath(
            '//p[@class="plan-route-map-container-dealer-info-name"]/text()'
        ).extract()).strip()

    item2 = MetaItem()
    item2['src'] = "bmw-dubai.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "bmw"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    # Hash the populated listing (before 'meta' and 'Source' are attached).
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item1), sort_keys=True).encode('utf-8')).hexdigest()
    item1['meta'] = dict(item2)
    item1['Source'] = item2['src']
    yield item1
def parse_data(self, response):
    """Scrape one Pearl Motors vehicle page into an ``AutodataItem``.

    Make and model are derived from the page title by brand keyword; the
    remaining specifications come from the ``<td>`` cells, read as
    consecutive (label, value) pairs.  A ``MetaItem`` snapshot with a
    checksum of the populated item is stored under ``item['meta']``.

    Args:
        response: Downloaded vehicle detail page.

    Yields:
        One populated ``AutodataItem``.

    Raises:
        IndexError: If the price heading or the title span is missing.
    """
    item = AutodataItem()
    # Start every exported column blank so each item carries the same keys.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item[field] = ""

    # Values that are fixed for this source.
    item['vat'] = 'yes'
    item["Last_Code_Update_Date"] = "Wednesday,June 19,2019"
    item["Scrapping_Date"] = datetime.today().strftime('%A, %B %d, %Y')
    item["Country"] = "UAE"
    item["City"] = "Dubai"
    item["Seller_Type"] = "Large Independent Dealers"
    item["Seller_Name"] = "Pearl Motors"
    item["Car_URL"] = response.url
    item['Price_Currency'] = 'AED'

    item['asking_price_inc_VAT'] = response.xpath(
        "//div[contains(@class,'car-price')]/h2/text()").extract(
        )[0].split('AED')[-1].strip().replace(',', '')
    item["Car_Name"] = response.xpath(
        "//span[contains(@class,'underline text-black')]/text()").extract(
        )[0].strip()

    title = item["Car_Name"]
    # (keyword looked for, Make to store, text removed to get the model).
    # Rules are applied in order, so a later match overrides an earlier one.
    brand_rules = (
        ("ROLLS ROYCE", "ROLLS ROYCE", 'ROLLS ROYCE'),
        ("FERRARI", "FERRARI", 'FERRARI'),
        ("MERCEDES", "MERCEDES", 'MERCEDES BENZ'),
        ("MASERATI", "MASERATI", 'MASERATI'),
        ("BENTLEY", "BENTLEY", 'BENTLEY'),
        ("LAMBORGHINI", "LAMBORGHINI", 'LAMBORGHINI'),
        ("MCLAREN", "MCLAREN", 'MCLAREN'),
        ("ASTON MARTIN", "ASTON MARTIN", 'ASTON MARTIN'),
    )
    for keyword, make, removed in brand_rules:
        if keyword in title:
            item["Make"] = make
            item["model"] = title.replace(removed, '').strip()
    # Range Rover titles carry the model name instead of the make.
    if "RANGE ROVER" in title:
        item["Make"] = "LAND ROVER"
        item["model"] = "RANGE ROVER"
        item["Spec"] = title.replace('RANGE ROVER', '').strip()

    cells = response.xpath("//tr/td/text()").extract()
    # Walk the cells as (label, following cell) pairs on the raw list.
    # Pairing on a de-duplicated copy would shift every value that comes
    # after a repeated cell, and zip() cannot run past the last cell.
    for label, value in zip(cells, cells[1:]):
        if label == 'Year':
            item["Year"] = value
        elif label == 'Kilometers':
            item["mileage"] = value
            item['mileage_unit'] = 'KM'
        elif label == 'Engine':
            item["other_specs_engine_size"] = value.split('L')[0]
            item['engine_unit'] = 'L'
        elif label == 'Horsepower':
            item["other_specs_horse_power"] = value
        elif label == 'Fuel Type':
            item["fuel_type"] = value
        elif label == 'Warranty':
            item['warranty'] = value
        elif label == 'Motors Trim':
            item['bodystyle'] = value

    item2 = MetaItem()
    item2['src'] = "pearl-motors.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "pearl"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    # Hash the populated listing (before 'meta' and 'Source' are attached).
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(item2)
    item['Source'] = item2['src']
    yield item
def parse_data(self, response):
    """Scrape one Mas Cars vehicle page into an ``AutodataItem``.

    Make and model come from dedicated ``<span>`` elements; the remaining
    fields come from the ``<strong>`` label / ``<span>`` value pairs of the
    details list.  A ``MetaItem`` snapshot is stored under
    ``item['meta']``; its checksum is taken before the two date fields are
    filled in, so it does not change from one scrape day to the next.

    Args:
        response: Downloaded vehicle detail page.

    Yields:
        One populated ``AutodataItem``.
    """
    item1 = AutodataItem()
    # Start every exported column blank so each item carries the same keys.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item1[field] = ""

    # Values that are fixed for this source.
    item1["Country"] = "Saudi Arabia"
    item1["Seller_Type"] = "Large Independent Dealers"
    item1["Seller_Name"] = "Mas Cars"
    item1["Car_URL"] = response.url
    item1['vat'] = 'yes'

    item1["Make"] = ''.join(
        response.xpath(
            '//div[@class="internal_details"]//ul//span[@id = "get-my-manufac"]/text()'
        ).extract()).strip()
    item1["model"] = ''.join(
        response.xpath(
            '//div[@class="internal_details"]//ul//span[@id = "get-my-model"]/text()'
        ).extract()).strip()
    # The displayed name is always rebuilt from make and model.
    item1["Car_Name"] = item1['Make'] + ' ' + item1['model']

    for det in response.xpath('//div[@class="internal_details"]//ul/li'):
        key = ''.join(det.xpath('strong[1]/text()').extract()).strip()
        value = ''.join(det.xpath('span[1]/text()').extract()).strip()
        if key == "Year:":
            item1['Year'] = value
        elif key == "Car Color:":
            item1["colour_exterior"] = value
        elif key == "Internal Color:":
            item1["colour_interior"] = value
        elif key == "Transmission:":
            item1["transmission"] = value
        elif key == "Engine:":
            # NOTE(review): the "Engine:" row is stored as mileage --
            # confirm this is what the site shows under that label.
            item1["mileage"] = value
            item1["mileage_unit"] = 'km'
        elif key == "Price:":
            # Expected form is "<amount> <currency>"; a price without a
            # currency token leaves the currency blank.
            price_tokens = value.split(' ')
            item1["asking_price_inc_VAT"] = price_tokens[0]
            if len(price_tokens) > 1:
                item1["Price_Currency"] = price_tokens[1]
        elif key == "Warrenty:":
            # The misspelt key is matched exactly as written here.
            item1["warranty"] = value

    item2 = MetaItem()
    item2['src'] = "www.mascars.net"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "mascars"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    # Hash the listing while the date fields are still blank.
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item1), sort_keys=True).encode('utf-8')).hexdigest()
    item1['meta'] = dict(item2)
    item1['Last_Code_Update_Date'] = 'Tuesday, June 18, 2019'
    item1['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')
    item1['Source'] = item2['src']
    yield item1
def parse_data(self, response):
    """Scrape one Abisayara marketplace listing into an ``AutodataItem``.

    The seller name, heading and price are read from fixed page elements;
    the remaining fields come from "Label: value" list entries inside the
    ``vif_info`` block.  A ``MetaItem`` snapshot is stored under
    ``item['meta']``; its checksum is taken before the two date fields are
    filled in, so it does not change from one scrape day to the next.

    Args:
        response: Downloaded listing page.

    Yields:
        One populated ``AutodataItem``.
    """
    item1 = AutodataItem()
    # Start every exported column blank so each item carries the same keys.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item1[field] = ""

    # Values that are fixed for this source.
    item1["Country"] = "Saudi Arabia"
    item1["Seller_Type"] = "Market Places"
    item1["Car_URL"] = response.url
    item1['vat'] = 'yes'
    item1['Price_Currency'] = 'SAR'

    # Seller text: keep the part before the first '-', drop the "Dealer:"
    # prefix, then collapse every run of non-alphanumerics to one space.
    seller = "".join(
        response.xpath('//div[@class="seller_logo_main clearfix"]/text()').
        extract()).strip().split('-')[0].replace('Dealer:', '').strip()
    item1["Seller_Name"] = re.sub('[^0-9a-zA-Z]+', ' ', seller).strip()

    item1["Car_Name"] = ''.join(
        response.xpath(
            '//h1[@class="vif_heading"]/text()').extract()).strip()

    details = response.xpath('//div[@class="vif_info"]')
    item1['asking_price_inc_VAT'] = ''.join(
        details.xpath('h3/text()').extract()).strip().split(
            'SAR')[0].strip()

    # Simple "Label: value" rows that map straight onto one item field.
    direct_fields = {
        'Make': 'Make',
        'Model': 'model',
        'Year': 'Year',
        'City': 'City',
        'Color': 'colour_exterior',
        'Engine size': 'other_specs_engine_size',
        'Gearbox': 'transmission',
    }
    for det in details.xpath('ul/li'):
        parts = ''.join(det.xpath('text()').extract()).strip().split(':')
        # An entry without a colon has no value part; skip it instead of
        # failing the whole listing.
        if len(parts) < 2:
            continue
        key = parts[0]
        value = parts[1].strip()
        if key == 'Mileage':
            item1['mileage'] = value.split(' ')[0]
            item1['mileage_unit'] = 'km'
        elif key in direct_fields:
            item1[direct_fields[key]] = value

    item2 = MetaItem()
    item2['src'] = "www.abisayara.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "abisayara"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    # Hash the listing while the date fields are still blank.
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item1), sort_keys=True).encode('utf-8')).hexdigest()
    item1['meta'] = dict(item2)
    item1['Last_Code_Update_Date'] = 'Tuesday, June 18, 2019'
    item1['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')
    item1['Source'] = item2['src']
    yield item1
def parse_dir_contents(self, response):
    """Scrape one pre-owned Mercedes-Benz (Abu Dhabi) page into an item.

    Values are read by position from the "left" and "right" ``<ul>`` lists
    and from the price heading; warranty, wheel size and service package
    are picked out of the page's list entries by keyword.  A ``MetaItem``
    snapshot is stored under ``item['meta']``; its ``cs`` checksum is taken
    after the listing fields are filled in, so it reflects the scraped data.

    Args:
        response: Downloaded vehicle detail page.

    Yields:
        One populated ``AutodataItem``.

    Raises:
        IndexError: If the heading, the price, or one of the mandatory
            list positions (left 1-4, right 0-2) is missing from the page.
    """
    item = AutodataItem()
    # Start every exported column blank so each item carries the same keys.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id", "wheel_size"):
        item[field] = ""

    # Values that are fixed for this source.
    item['vat'] = 'yes'
    item['Last_Code_Update_Date'] = 'June 15, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')
    item['Car_URL'] = response.url
    item['Make'] = 'Mercedes-Benz'
    item['mileage_unit'] = 'km'
    item['Price_Currency'] = 'AED'
    item['Country'] = 'UAE'

    def after_colon(text):
        """Return the stripped text that follows the last ':' in ``text``."""
        return text.split(':')[-1].strip()

    # Each list is extracted once and then addressed by position.
    left = response.xpath("//ul[contains(@class, 'left')]/li/text()").extract()
    right = response.xpath(
        "//ul[contains(@class, 'right')]/li/text()").extract()

    item['Year'] = after_colon(left[1])
    item['mileage'] = after_colon(left[2])
    item['colour_exterior'] = after_colon(left[3])
    item['bodystyle'] = after_colon(left[4])
    # The sixth entry is not present on every page.
    if len(left) > 5:
        item['transmission'] = after_colon(left[5])

    item['City'] = after_colon(right[0])
    item['Seller_Name'] = after_colon(right[1])
    item['trim'] = after_colon(right[2])

    heading_words = response.xpath(
        "//div[contains(@class, 'col_8 content car-detail')]/h2/text()"
    ).extract()[0].split()
    # The model is the first two heading words glued together.
    item['model'] = ''.join(heading_words[:2])
    item['Car_Name'] = item['Make'] + ' ' + item['model']

    item['asking_price_inc_VAT'] = response.xpath(
        "//div[contains(@class,'price')]/h3/text()").extract()[0].split(
            'AED')[-1].strip()

    for line in response.xpath("//ul/li/text()").extract():
        if "Warranty :" in line:
            # Text after the marker is "<status>, <duration>".
            pieces = line.split('Warranty :')[-1].strip().split(',')
            item['warranty'] = pieces[0].strip()
            # Without a comma there is no duration part; leave it blank
            # rather than repeating the status text.
            item['warranty_untill_when'] = (pieces[-1].strip()
                                            if len(pieces) > 1 else '')
        elif "Wheel Size" in line:
            item['wheel_size'] = line.split('Wheel Size :')[-1].strip()
        elif "With a service package" in line:
            item['service_contract'] = line.split(
                'With a service package :')[-1].strip()

    # Turn a leading number of years into an end date counted from the
    # scrape day.  Runs once, after all lines were inspected.  A blank or
    # non-numeric duration is left exactly as scraped.
    try:
        warranty_years = int(item['warranty_untill_when'].split(' ')[0])
    except ValueError:
        pass
    else:
        item['warranty_untill_when'] = (
            datetime.today() +
            relativedelta(years=+warranty_years)).strftime('%Y-%m-%d')

    meta = MetaItem()
    meta['src'] = "abu-dhabi.pe-mb.com"
    meta['ts'] = datetime.utcnow().isoformat()
    meta['name'] = "abudhabi"
    meta['url'] = response.url
    meta['uid'] = str(uuid.uuid4())
    # Hash the populated listing (before 'meta' and 'Source' are attached).
    meta['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(meta)
    item['Source'] = meta['src']
    yield item
def parse_data(self, response):
    """Scrape one Sun City Motors vehicle page into an ``AutodataItem``.

    Make, model and spec are derived from the page title by brand keyword.
    Year, mileage and engine size are read from fixed ``<td>`` positions;
    horsepower, exterior colour, fuel type and warranty are found by label
    among consecutive (label, value) cells.  A ``MetaItem`` snapshot with a
    checksum of the populated item is stored under ``item['meta']``.

    Args:
        response: Downloaded vehicle detail page.

    Yields:
        One populated ``AutodataItem``.

    Raises:
        IndexError: If the price, the title, or the first ten table cells
            are missing from the page.
    """
    item = AutodataItem()
    # Start every exported column blank so each item carries the same keys.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item[field] = ""

    # Values that are fixed for this source.
    item['vat'] = 'yes'
    item["Last_Code_Update_Date"] = "Wednesday,June 19,2019"
    item["Scrapping_Date"] = datetime.today().strftime('%A, %B %d, %Y')
    item["Country"] = "UAE"
    item["City"] = "Dubai"
    item["Seller_Type"] = "Large Independent Dealers"
    item["Seller_Name"] = "Sun City Motors"
    item["Car_URL"] = response.url
    item['Price_Currency'] = 'AED'

    item['asking_price_inc_VAT'] = response.xpath(
        "//span[contains(@class,'price_figure ')]/text()").extract(
        )[0].split('AED')[-1].strip()
    item["Car_Name"] = response.xpath(
        "//div[contains(@class,'single_page_title hidden-sm hidden-xs')]/h1/text()"
    ).extract()[0].strip()

    title = item["Car_Name"]
    words = title.split()
    # Second and third title words; a short title yields '' instead of
    # raising IndexError (e.g. a two-word "MASERATI GHIBLI").
    second = words[1] if len(words) > 1 else ""
    third = words[2] if len(words) > 2 else ""

    # (keyword looked for, Make to store, how model/spec are derived).
    # Rules are applied in order, so a later match overrides an earlier one.
    #   strip          model = title without the keyword
    #   range          model is the keyword itself, spec = rest of title
    #   second         model = second word
    #   second+rest    model = second word, spec = title minus both
    #   second+letters model = second word, spec = letters of third word
    #   second+third   model = second word, spec = third word
    brand_rules = (
        ("BMW", "BMW", "second"),
        ("AUDI", "AUDI", "second+rest"),
        ("RANGE ROVER", "LAND ROVER", "range"),
        ("DODGE", "DODGE", "strip"),
        ("LAND ROVER", "LAND ROVER", "strip"),
        ("MASERATI", "MASERATI", "second+letters"),
        ("PORSCHE", "PORSCHE", "second+rest"),
        ("CHEVROLET", "CHEVROLET", "second+third"),
        ("JAGUAR", "JAGUAR", "strip"),
        ("FORD", "FORD", "second+rest"),
        ("JEEP", "JEEP", "second+rest"),
        ("MERCEDES-BENZ", "MERCEDES", "strip"),
        ("NISSAN", "NISSAN", "strip"),
    )
    for keyword, make, rule in brand_rules:
        if keyword not in title:
            continue
        item["Make"] = make
        if rule == "strip":
            item["model"] = title.replace(keyword, '').strip()
        elif rule == "range":
            item["model"] = "RANGE ROVER"
            item["Spec"] = title.replace(keyword, '').strip()
        else:
            item["model"] = second
            if rule == "second+rest":
                item["Spec"] = title.replace(keyword, '').replace(
                    second, '').strip()
            elif rule == "second+letters":
                item["Spec"] = " ".join(re.findall("[a-zA-Z]+", third))
            elif rule == "second+third":
                item["Spec"] = third

    cells = response.xpath("//tr/td/text()").extract()
    # These three values are taken from fixed cell positions.
    item["Year"] = cells[1]
    item["mileage"] = cells[7]
    item['mileage_unit'] = 'KM'
    item["other_specs_engine_size"] = cells[9].split('L')[0]
    item['engine_unit'] = 'L'

    # Walk the cells as (label, following cell) pairs on the raw list.
    # Pairing on a de-duplicated copy would shift every value that comes
    # after a repeated cell, and zip() cannot run past the last cell.
    for label, value in zip(cells, cells[1:]):
        if label == 'Horsepower':
            item["other_specs_horse_power"] = value
        elif label == 'Exterior Color':
            item["colour_exterior"] = value
        elif label == 'Fuel Type':
            item["fuel_type"] = value
        elif label == 'Warranty':
            item['warranty'] = value

    item2 = MetaItem()
    item2['src'] = "suncity.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "suncity_spider"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    # Hash the populated listing (before 'meta' and 'Source' are attached).
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(item2)
    item['Source'] = item2['src']
    yield item
def parse_data(self, response):
    """Scrape one YallaMotor Bahrain listing into an ``AutodataItem``.

    Make and model are the first two words of the page heading (with the
    "Used " prefix removed); the remaining fields come from the icon-label /
    value blocks of the listing.  A ``MetaItem`` snapshot with a timestamp
    and a checksum of the populated item is stored under ``item['meta']``.

    Args:
        response: Downloaded listing page.

    Yields:
        One populated ``AutodataItem``.
    """
    item = AutodataItem()
    # Start every exported column blank so each item carries the same keys.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item[field] = ""

    # Values that are fixed for this source.
    item["Country"] = "Bahrain"
    item["Seller_Type"] = "Market Places"
    item["Seller_Name"] = "Yallamotors"
    item["Car_URL"] = response.url
    item['Price_Currency'] = 'BHD'
    item['vat'] = 'yes'
    item['mileage_unit'] = 'km'
    item['Last_Code_Update_Date'] = 'Thursday, June 07, 2019'
    item['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')

    item["Car_Name"] = ''.join(
        response.xpath(
            '//*[@id="mainContent"]/section[3]/div/div/div[1]/h1/text()').
        extract()).replace('Used ', '').strip()
    name_words = item["Car_Name"].split(' ')
    item["Make"] = name_words[0]
    # A missing or one-word heading leaves the model blank instead of
    # raising IndexError.
    if len(name_words) > 1:
        item["model"] = name_words[1]

    item['asking_price_inc_VAT'] = ''.join(
        response.xpath(
            '//span[@class="price-count h3 green bold block"]/text()').
        extract()).strip()
    # NOTE(review): this XPath has no leading '//', so it is evaluated
    # relative to the response root and presumably matches nothing --
    # confirm whether '//div[...]' was intended.
    item['asking_price_ex_VAT'] = ''.join(
        response.xpath(
            "div[@class = 'col-md-3 used-car-user-info']/span[@class = 'price-count h3 green bold block']/span[@class ='price-count_small']/text()"
        ).extract()).strip()

    for block in response.xpath('//div[@class="pull-left text-left"]'):
        key = ''.join(block.xpath("i//text()").extract()).strip()
        value = ''.join(
            block.xpath("strong[@class='block']//text()").extract()).strip()
        # Some labels are matched with a trailing colon and some without,
        # exactly as listed here.
        if key == "Location":
            item['City'] = value
        elif key == "Model Year":
            item['Year'] = value
        elif key == "Car Driven":
            item["mileage"] = value
        elif key == "Transmission:":
            item["transmission"] = value
        elif key == "Fuel Type:":
            item["fuel_type"] = value
        elif key == "Number of Doors":
            item["Doors"] = value.replace('Door', '').strip()
        elif key == "Number of Cylinders":
            item['cylinders'] = value
        elif key == "Body Style:":
            item['bodystyle'] = value
        elif key == "Exterior Color":
            item['colour_exterior'] = value

    item2 = MetaItem()
    item2['src'] = "bahrain.yallamotor.com"
    # Record the scrape timestamp alongside the other metadata.
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "yallamotor"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    # Hash the populated listing (before 'meta' and 'Source' are attached).
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(item2)
    item['Source'] = item2['src']
    yield item
def parse_data(self, response):
    """Scrape one chevrolet.uma.com.sa vehicle page into an item.

    Args:
        response: Page response exposing ``url`` and accepted by
            ``Selector`` (presumably a Scrapy response).

    Yields:
        One ``AutodataItem``. Its ``meta`` checksum is taken over the
        scraped fields before the date stamps and seller name are set, so
        those three fields do not influence it.
    """
    item2 = MetaItem()
    item1 = AutodataItem()

    # Blank template: every known field exists even if the page lacks it.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id", "wheel_size",
            "top_speed_kph", "cylinders", "acceleration", "torque_Nm"):
        item1[field] = ""
    item1["vat"] = "yes"

    sel = Selector(response)
    title = sel.xpath('//div[@class = "title module align-center"]')
    overview = sel.xpath(
        '//div[@class="overview-data module overview-data-standard"]/div/div'
    )
    address = sel.xpath('//div[@class="address"]')
    price_now = sel.xpath(
        '//div[@class="price module u-hidden-sm-only"]/div[@class="price-now"]'
    )

    def joined_text(nodes, path):
        # Concatenate all text matched by ``path`` and trim the ends.
        return ''.join(nodes.xpath(path).extract()).strip()

    def title_part(css_class):
        # Text of one <span> inside the title heading.
        return joined_text(
            title, 'h3/span[@class= "{0}"]//text()'.format(css_class))

    def overview_cell(name):
        # Text of one "cell <name>" / "value <name>" pair in the overview.
        return joined_text(
            overview,
            './/div[@class="cell {0}"]/span[@class="value {0}"]//text()'
            .format(name))

    def value_and_unit(text):
        # "12000 km" -> ("12000", "km"); a missing unit yields "" rather
        # than an IndexError.
        parts = text.split(' ')
        return parts[0], (parts[1] if len(parts) > 1 else "")

    item1['Car_URL'] = response.url
    item1['Year'] = title_part("year")
    item1['Make'] = title_part("make")
    item1['model'] = title_part("model")
    item1['Spec'] = title_part("variant")
    item1['Car_Name'] = (
        item1['Make'] + ' ' + item1['model'] + ' ' + item1['Spec'])

    item1['mileage'], item1['mileage_unit'] = value_and_unit(
        overview_cell("mileage"))
    item1['fuel_type'] = overview_cell("fuel-type")
    item1['Doors'] = overview_cell("doors")
    item1['other_specs_engine_size'], item1['engine_unit'] = value_and_unit(
        overview_cell("engine-size"))
    # NOTE(review): the interior colour is stored under "trim" while
    # "colour_interior" stays blank -- confirm this mapping is intended.
    item1['trim'] = overview_cell("interior-colour")
    item1['transmission'] = overview_cell("transmission")
    item1['bodystyle'] = overview_cell("bodystyle")
    item1['colour_exterior'] = overview_cell("exterior-colour")

    item1['City'] = joined_text(
        address, 'span[@class="address-city"]//text()')
    item1['Country'] = 'Saudi Arabia'

    # The price text starts with a 3-character currency code; the rest is
    # the amount, trimmed so no separator whitespace remains.
    price_text = joined_text(price_now, 'span[@class="value"]//text()')
    item1['Price_Currency'] = price_text[:3]
    item1['asking_price_inc_VAT'] = price_text[3:].strip()

    item2['src'] = "chevrolet.uma.com.sa"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "spider"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item1), sort_keys=True).encode('utf-8')).hexdigest()
    item1['meta'] = dict(item2)

    # Set after the checksum so they do not affect it.
    item1['Last_Code_Update_Date'] = 'June 6, 2019'
    item1['Scrapping_Date'] = datetime.today().strftime('%Y-%m-%d')
    item1['Seller_Name'] = 'Universal Motors Agencies'
    item1['Source'] = item2['src']
    yield item1
def parse_data(self, response):
    """Scrape one Dubizzle used-car listing page into an item.

    Args:
        response: Page response exposing ``xpath()`` and ``url``
            (presumably a Scrapy response).

    Yields:
        One ``AutodataItem``, only when a car name could be built from
        the listing's make and model. ``meta`` carries an MD5 checksum of
        the scraped fields.
    """
    item = AutodataItem()
    item2 = MetaItem()

    # Blank template: every known field exists even if the page lacks it.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id", "wheel_size",
            "top_speed_kph", "cylinders", "acceleration", "torque_Nm"):
        item[field] = ""

    # Constants for this source.
    item["vat"] = "yes"
    item["Last_Code_Update_Date"] = "Friday,June 21,2019"
    item["Scrapping_Date"] = datetime.today().strftime('%A, %B %d, %Y')
    item["Country"] = "UAE"
    item["Seller_Type"] = "Marketplaces"
    item["Seller_Name"] = "Dubizzle"
    item["Car_URL"] = response.url
    item["Price_Currency"] = 'AED'
    # Fall back to "" like every other field when no price is shown.
    item["asking_price_inc_VAT"] = response.xpath(
        "//span[contains(@id,'actualprice')]/text()").get(default='')

    values = response.xpath(
        '//div[@id="listing-details-list"]//li/strong/text()').extract()
    labels = [
        text.strip() for text in response.xpath(
            '//div[@id="listing-details-list"]//li/span/text()').extract()
    ]
    labels = [text for text in labels if text]

    # Labels and values are paired by position; zip() stops at the shorter
    # list, so a page with an unlabelled value cannot raise IndexError.
    # Branch order matters because labels are matched by substring.
    for label, raw in zip(labels, values):
        value = raw.strip()
        if 'Year' in label:
            item["Year"] = value
        elif 'Kilometers' in label:
            item["mileage"] = value
            item["mileage_unit"] = 'KM'
        elif 'Color' in label:
            item["colour_exterior"] = value
        elif 'Doors' in label:
            words = raw.split()
            if words:
                item["Doors"] = words[0].replace('+', '').strip()
        elif 'Warranty' in label:
            item["warranty"] = value
        elif 'Specs' in label:
            item["Spec"] = value
        elif 'Transmission' in label:
            item["transmission"] = raw.replace('Transmission', '').strip()
        elif 'Body Type' in label:
            item["bodystyle"] = value
        elif 'Fuel Type' in label:
            item["fuel_type"] = value
        elif 'Trim' in label:
            # Use the value paired with this label, not a fixed position.
            item["trim"] = value
        elif 'Cylinders' in label:
            if 'Unknown' not in raw:
                item["cylinders"] = value
        elif 'Make' in label:
            item["Make"] = remove_non_ascii(value)
        elif 'Model' in label:
            # The cleaned model goes into "model"; "Make" is left alone.
            item["model"] = remove_non_ascii(value)
        elif 'Horsepower' in label:
            if 'unknown' not in value.lower():
                item["other_specs_horse_power"] = raw.split('HP')[0].strip()

    if item["Make"] != '':
        item["Car_Name"] = item["Make"] + ' ' + item["model"]
    if item["Car_Name"] == '':
        # Nothing identifiable was scraped; emit no item.
        return

    # Source bookkeeping; the checksum is taken after scraping so it
    # reflects the listing's content.
    item2['src'] = "dubizzle.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "dubizzle_spider"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(item2)
    item['Source'] = item2['src']
    yield item
def parse_dir_contents(self, response):
    """Scrape one behbehaniusedcars.com vehicle page into an item.

    The page's spec list is read by position: 1 = year, 2 = mileage,
    3 = transmission, 4 = engine size, 6 = cylinders, 7 = body style.
    Positions missing from the page leave the field blank.

    Args:
        response: Page response exposing ``xpath()`` and ``url``
            (presumably a Scrapy response).

    Yields:
        One ``AutodataItem`` whose ``meta`` entry holds an MD5 checksum
        of the scraped fields.
    """
    item = AutodataItem()
    item2 = MetaItem()

    # Blank template: every known field exists even if the page lacks it.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id", "wheel_size",
            "top_speed_kph", "cylinders", "acceleration", "torque_Nm"):
        item[field] = ""

    # Constants for this source.
    item["Country"] = "Bahrain"
    item["City"] = "Sitra"
    item["engine_unit"] = 'l'
    item["mileage_unit"] = 'km'
    item["Price_Currency"] = 'BD'
    item["Seller_Name"] = 'Behbehani Brothers'
    item["Last_Code_Update_Date"] = 'Thursday, June 07, 2019'
    item["Scrapping_Date"] = datetime.today().strftime('%A, %B %d, %Y')

    heading = ''.join(
        response.xpath("//div[contains(@id,'listings')]/h2/text()").extract())
    specs = response.xpath(
        "//ul[contains(@class,'specs')]/li/text()").extract()

    def spec(position):
        # Trimmed spec entry at ``position``, or "" when the list is short.
        return specs[position].strip() if position < len(specs) else ''

    item["Car_Name"] = heading.replace('(Approved)', '').replace(
        '“', '').replace('”', '').replace("Approved", '').strip()
    item["Year"] = spec(1)

    # The make normally comes from the "<make> Showroom" caption; the
    # generic used-car showroom falls back to the heading's first word.
    item["Make"] = ''.join(
        response.xpath('//p[@class="showroom"]/text()').extract()).split(
            'Showroom')[0].strip()
    if 'used car' in item["Make"].lower():
        item["Make"] = heading.replace(item["Year"],
                                       '').strip().split(' ')[0]
    if 'alfa romeo' in item["Car_Name"].lower():
        item["Make"] = 'Alfa Romeo'
    if 'jetta' in item["Car_Name"].lower():
        item["Make"] = 'Volkswagen'
    item["Car_Name"] = item["Make"] + ' ' + item["Car_Name"]

    item["transmission"] = spec(3)
    item["cylinders"] = spec(6).split(' ')[0]
    item["bodystyle"] = spec(7)
    item["other_specs_engine_size"] = spec(4).replace('L', '').replace(
        'TC', '').replace('SC', '').replace('T', '')
    # More than three characters left is treated as cubic centimetres.
    if len(item["other_specs_engine_size"]) > 3:
        item["engine_unit"] = 'cc'
    item["mileage"] = spec(2)

    price_texts = response.xpath(
        "//div[contains(@id,'listings')]/h3/text()").extract()
    price = price_texts[0] if price_texts else ''
    item["asking_price_inc_VAT"] = ''.join(re.findall(r'\d+', price))

    # Prefer a brand name from the footer menu when the car name contains
    # it. Empty link texts are skipped: "" is a substring of every name
    # and would otherwise blank the make.
    for brand in response.xpath('//ul[@id="menu-footer-brands"]/li'):
        make = ''.join(brand.xpath('a/text()').extract()).strip()
        if make and make in item["Car_Name"]:
            item["Make"] = make

    # Source bookkeeping; the checksum is taken after scraping so it
    # reflects the page's content.
    item2['src'] = "behbehaniusedcars.com"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "behbaniused"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item), sort_keys=True).encode('utf-8')).hexdigest()
    item['meta'] = dict(item2)
    item['Car_URL'] = response.url
    item['Source'] = item2['src']
    yield item
def parse_data(self, response):
    """Scrape one olx.com.om car advert page into an item.

    Args:
        response: Page response exposing ``url`` and accepted by
            ``Selector`` (presumably a Scrapy response).

    Yields:
        One ``AutodataItem``. Its ``meta`` checksum is taken over the
        scraped fields before the date stamps are set, so the scrape
        date does not influence it.
    """
    item2 = MetaItem()
    item1 = AutodataItem()
    sel = Selector(response)

    # Blank template: every known field exists even if the page lacks it.
    for field in (
            "Last_Code_Update_Date", "Scrapping_Date", "Country", "City",
            "Seller_Type", "Seller_Name", "Car_URL", "Car_Name", "Year",
            "Make", "model", "Spec", "Doors", "transmission", "trim",
            "bodystyle", "other_specs_gearbox", "other_specs_seats",
            "other_specs_engine_size", "other_specs_horse_power",
            "colour_exterior", "colour_interior", "fuel_type",
            "import_yes_no_also_referred_to_as_GCC_spec", "mileage",
            "condition", "warranty_untill_when",
            "service_contract_untill_when", "Price_Currency",
            "asking_price_inc_VAT", "asking_price_ex_VAT", "warranty",
            "service_contract", "vat", "mileage_unit", "engine_unit",
            "autodata_Make", "autodata_Make_id", "autodata_model",
            "autodata_model_id", "autodata_Spec", "autodata_Spec_id",
            "autodata_transmission", "autodata_transmission_id",
            "autodata_bodystyle", "autodata_bodystyle_id"):
        item1[field] = ""

    # Constants for this source.
    item1["Country"] = "Oman"
    item1["Seller_Type"] = "Market Places"
    item1["Car_URL"] = response.url
    item1["vat"] = 'yes'
    item1["Price_Currency"] = 'OMR'

    # Detail-table rows whose value is copied verbatim: header -> field.
    direct_fields = {
        'Model': "model",
        'Transmission Type': "transmission",
        'Year': "Year",
        'Color': "colour_exterior",
        'Body Type': "bodystyle",
    }
    for row in sel.xpath('//table[@class="item"]//tr'):
        key = ''.join(row.xpath('th/text()').extract()).strip()
        value = ''.join(row.xpath('td//text()').extract()).strip()
        if key == 'Kilometers':
            # Keep only the first space-separated token of the value.
            item1["mileage"] = value.split(' ')[0]
        elif key == 'Warranty':
            # "Does Not Apply" is recorded as no warranty.
            item1["warranty"] = '' if value == 'Does Not Apply' else value
        elif key in direct_fields:
            item1[direct_fields[key]] = value

    item1["asking_price_inc_VAT"] = ''.join(
        sel.xpath('//div[@class="pricelabel tcenter"]//text()').extract()
    ).strip().split(' ')[0]
    # The city is taken as the last word of the location line.
    item1["City"] = ''.join(
        sel.xpath('//strong[@class="c2b small"]//text()').extract()
    ).strip().split(' ')[-1]
    # First word of the fourth breadcrumb entry, minus its leading
    # character.
    item1["Make"] = ''.join(
        sel.xpath('//ul[@class="clearfix"]/li[4]//span/text()').extract()
    ).strip().split(' ')[0][1:]
    item1["Car_Name"] = item1["Make"] + ' ' + item1["model"]

    # Source bookkeeping; the checksum is taken after scraping so it
    # reflects the advert's content.
    item2['src'] = "olx.com.om"
    item2['ts'] = datetime.utcnow().isoformat()
    item2['name'] = "olx_om"
    item2['url'] = response.url
    item2['uid'] = str(uuid.uuid4())
    item2['cs'] = hashlib.md5(
        json.dumps(dict(item1), sort_keys=True).encode('utf-8')).hexdigest()
    item1['meta'] = dict(item2)
    item1['Source'] = item2['src']

    # Set after the checksum so they do not affect it.
    item1['Last_Code_Update_Date'] = 'Tuesday, June 18, 2019'
    item1['Scrapping_Date'] = datetime.today().strftime('%A, %B %d, %Y')
    yield item1