def parse(self, response):
    """Parse a result-list page built from the inline ``model`` JSON.

    Yields one ``ImmoscoutItem`` per listing, then follows the
    pagination link (if any) back into :meth:`parse`.
    """
    print(response.url)  # was a Python-2 print statement; rest of file is Python 3
    for line in response.xpath(self.script_xpath).extract_first().split('\n'):
        if line.strip().startswith('model'):
            immo_json = line.strip()
            # Strip the leading "model: " prefix (7 chars) and trailing ";".
            immo_json = json.loads(immo_json[7:-1])
            for result in immo_json["results"]:
                item = ImmoscoutItem()
                item['immo_id'] = result['id']
                item['url'] = response.urljoin("/expose/" + str(result['id']))
                item['title'] = result['title']
                item['address'] = result['address']
                item['city'] = result['city']
                item['zip_code'] = result['zip']
                item['district'] = result['district']
                for attr in result['attributes']:
                    if attr['title'] == "Kaltmiete":
                        item['rent'] = attr['value'][:-2]  # remove units
                    if attr['title'] == u"Wohnfläche":
                        item['sqm'] = attr['value'][:-3]  # remove units
                    if attr['title'] == "Zimmer":
                        item['rooms'] = attr['value']
                # Optional fields: catch only the KeyError a missing key
                # raises instead of swallowing every exception.
                try:
                    item['contact_name'] = result['contactName']
                except KeyError:
                    item['contact_name'] = None
                try:
                    item['media_count'] = result['mediaCount']
                except KeyError:
                    item['media_count'] = 0
                try:
                    item['lat'] = result['latitude']
                    item['lng'] = result['longitude']
                except KeyError:
                    item['lat'] = None
                    item['lng'] = None
                yield item
    # On the last results page the pagination xpath matches nothing;
    # indexing [-1] on that empty list raised IndexError (the old
    # `is not None` check could never fire).
    next_page_list = response.xpath(self.next_xpath).extract()
    if next_page_list:
        next_page = response.urljoin(next_page_list[-1])
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Parse a result-list page from the embedded ``resultListModel`` JSON.

    For every listing an ``ImmoscoutItem`` is pre-filled and a request to
    the expose page is yielded (``parse_expose`` completes the item via
    ``meta['thisItem']``). Pagination is followed back into this method.
    """
    print(response.url)
    for line in response.xpath(self.script_xpath).extract_first().split('\n'):
        if line.strip().startswith('resultListModel'):
            immo_json = line.strip()
            try:
                # Strip the "resultListModel: " prefix (17 chars) and the
                # trailing ";".
                immo_json = json.loads(immo_json[17:-1])
                entries = immo_json["searchResponseModel"][
                    "resultlist.resultlist"]["resultlistEntries"][0]
                numberOfHits = int(entries["@numberOfHits"])
                print("Number of hits: %i" % (numberOfHits, ))
                result_list = entries["resultlistEntry"]
                # On result pages with a single hit the API returns a dict
                # instead of a list; normalize so iteration sees entries,
                # not dict keys (this was the old TODO).
                if isinstance(result_list, dict):
                    result_list = [result_list]
                for result in result_list:
                    item = ImmoscoutItem()
                    data = result["resultlist.realEstate"]
                    item['immo_id'] = data['@id']
                    item['createdAtDate'] = result['@creation']
                    item['modifiedAtDate'] = result['@modification']
                    item['publishedAtDate'] = result['@publishDate']
                    item['hasNewFlag'] = result['hasNewFlag']
                    item['url'] = response.urljoin("/expose/" + str(data['@id']))
                    item['title'] = data['title']
                    address = data['address']
                    # street/houseNumber are optional in the feed.
                    try:
                        item['address'] = address['street'] + " " + address['houseNumber']
                    except KeyError:
                        item['address'] = None
                    item['newHomeBuilder'] = result.get('newHomeBuilder')
                    item['floorplan'] = data.get('floorplan')
                    item['city'] = address['city']
                    item['zip_code'] = address['postcode']
                    item['district'] = address['quarter']
                    item["rent"] = data["price"]["value"]
                    item["livingSpace"] = data["livingSpace"]  # Wohnflaeche
                    item["rooms"] = data["numberOfRooms"]
                    if "calculatedPrice" in data:
                        # warm rent minus cold rent = extra costs
                        item["extra_costs"] = (
                            data["calculatedPrice"]["value"] - data["price"]["value"])
                    if "builtInKitchen" in data:
                        item["kitchen"] = data["builtInKitchen"]
                    if "balcony" in data:
                        item["balcony"] = data["balcony"]
                    if "garden" in data:
                        item["garden"] = data["garden"]
                    if "privateOffer" in data:
                        item["private"] = data["privateOffer"]
                    if "plotArea" in data:
                        item["plotArea"] = data["plotArea"]
                    if "cellar" in data:
                        item["cellar"] = data["cellar"]
                    try:
                        contact = data['contactDetails']
                        item['contact_name'] = (
                            contact['firstname'] + " " + contact["lastname"])
                    except KeyError:
                        item['contact_name'] = None
                    try:
                        item['media_count'] = len(
                            data['galleryAttachments']['attachment'])
                    except (KeyError, TypeError):
                        item['media_count'] = 0
                    try:
                        item['lat'] = address['wgs84Coordinate']['latitude']
                        item['lng'] = address['wgs84Coordinate']['longitude']
                    except KeyError:
                        item['lat'] = None
                        item['lng'] = None
                    # Detail page fills in the remaining fields.
                    yield Request(item['url'],
                                  callback=self.parse_expose,
                                  meta={'thisItem': item})
            except Exception as e:
                # Top-level boundary for one script line: log and continue
                # with the next line / next page.
                print("There was a general error: %s" % (e, ))
    next_page_list = response.xpath(self.next_xpath).extract()
    if next_page_list:
        next_page = next_page_list[-1]
        print("Scraping next page", next_page)
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Parse a result-list page from the embedded ``resultListModel`` JSON
    and yield one fully-populated ``ImmoscoutItem`` per listing, then
    follow pagination back into this method.
    """
    print(response.url)
    for line in response.xpath(self.script_xpath).extract_first().split('\n'):
        if line.strip().startswith('resultListModel'):
            immo_json = line.strip()
            # Strip the "resultListModel: " prefix (17 chars) and trailing ";".
            immo_json = json.loads(immo_json[17:-1])
            result_list = immo_json["searchResponseModel"][
                "resultlist.resultlist"]["resultlistEntries"][0][
                "resultlistEntry"]
            # Single-hit pages return a dict instead of a list; normalize
            # so iteration works (this was the old TODO).
            if isinstance(result_list, dict):
                result_list = [result_list]
            for result in result_list:
                item = ImmoscoutItem()
                data = result["resultlist.realEstate"]
                item['immo_id'] = data['@id']
                item['url'] = response.urljoin("/expose/" + str(data['@id']))
                item['title'] = data['title']
                address = data['address']
                # street/houseNumber are optional in the feed.
                try:
                    item['address'] = address['street'] + " " + address['houseNumber']
                except KeyError:
                    item['address'] = None
                item['city'] = address['city']
                item['zip_code'] = address['postcode']
                item['district'] = address['quarter']
                try:
                    item['lat'] = address['wgs84Coordinate']['latitude']
                    item['lng'] = address['wgs84Coordinate']['longitude']
                except KeyError:
                    item['lat'] = None
                    item['lng'] = None
                item["rent"] = data["price"]["value"]
                item["livingSpace"] = data["livingSpace"]
                item["rooms"] = data["numberOfRooms"]
                item["brokerage"] = data["courtage"]["hasCourtage"]
                if "calculatedPrice" in data:
                    # warm rent minus cold rent = extra costs
                    item["extra_costs"] = (
                        data["calculatedPrice"]["value"] - data["price"]["value"])
                if "builtInKitchen" in data:
                    item["kitchen"] = data["builtInKitchen"]
                if "balcony" in data:
                    item["balcony"] = data["balcony"]
                if "garden" in data:
                    item["garden"] = data["garden"]
                if "privateOffer" in data:
                    item["private"] = data["privateOffer"]
                if "plotArea" in data:
                    item["area"] = data["plotArea"]
                if "cellar" in data:
                    item["cellar"] = data["cellar"]
                if "guestToilet" in data:
                    item["guestToilet"] = data["guestToilet"]
                if "@publishDate" in result:
                    item["publishDate"] = result["@publishDate"]
                try:
                    contact = data['contactDetails']
                    item['contact_name'] = (
                        contact['firstname'] + " " + contact["lastname"])
                except KeyError:
                    item['contact_name'] = None
                try:
                    item['media_count'] = len(
                        data['galleryAttachments']['attachment'])
                except (KeyError, TypeError):
                    item['media_count'] = 0
                yield item
    next_page_list = response.xpath(self.next_xpath).extract()
    if next_page_list:
        next_page = next_page_list[-1]
        print("Scraping next page", next_page)
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Parse a result-list page from the embedded ``resultListModel`` JSON
    into a wide ``ImmoscoutItem`` (address, contact, price and feature
    fields), then follow pagination back into this method.

    Missing optional fields are recorded as ``""`` (this version's
    convention) rather than ``None``.
    """
    #print(response.url)
    for line in response.xpath(self.script_xpath).extract_first().split('\n'):
        if line.strip().startswith('resultListModel'):
            immo_json = line.strip()
            # Strip the "resultListModel: " prefix (17 chars) and the
            # trailing ";" — i.e. keep characters 18..(last-1).
            immo_json = json.loads(immo_json[17:-1])
            result_list = immo_json["searchResponseModel"][
                "resultlist.resultlist"]["resultlistEntries"][0][
                "resultlistEntry"]
            # Single-hit pages return a dict instead of a list; normalize
            # so iteration works (this was the old TODO).
            if isinstance(result_list, dict):
                result_list = [result_list]
            for result in result_list:
                item = ImmoscoutItem()  # define new field if needed here
                data = result["resultlist.realEstate"]

                # General information
                item['immo_id'] = data['@id']
                item['title'] = data['title']
                item['url'] = response.urljoin("/expose/" + str(data['@id']))
                item['retype'] = data['@xsi.type']

                # Address
                address = data['address']
                try:
                    item['address'] = (address['city'] + " " +
                                       address['street'] + " " +
                                       address['houseNumber'])
                except KeyError:
                    item['address'] = ""
                item['city'] = address['city']
                item['street'] = address.get('street', "")
                item['housenumber'] = address.get('houseNumber', "")
                # BUGFIX: the old check looked for the key in `data`, but
                # the value lives in `address`, so it was never picked up.
                item['precisehousenumber'] = address.get('preciseHouseNumber', "")
                item['zip_code'] = address['postcode']
                item['district'] = address['quarter']
                try:
                    item['lat'] = address['wgs84Coordinate']['latitude']
                    item['lng'] = address['wgs84Coordinate']['longitude']
                except KeyError:
                    item['lat'] = ""
                    item['lng'] = ""

                # Additions
                item["balcony"] = data.get("balcony", "")
                item["kitchen"] = data.get("builtInKitchen", "")
                item["cellar"] = data.get("cellar", "")
                # BUGFIX: the old check tested a lowercase key against
                # `data` while reading `address['companyWideCustomerId']`,
                # so the field was always empty.
                item['companywidecustomerid'] = address.get(
                    'companyWideCustomerId', "")

                # Contact details
                contact = data['contactDetails']
                item['contcompany'] = contact.get('company', "")
                try:
                    item['contname'] = (
                        contact['firstname'] + " " + contact["lastname"])
                except KeyError:
                    item['contname'] = ""
                # BUGFIX: the old checks tested item-field names
                # ("contfirstname" etc.) against `data`; the values live
                # in `contact` under the feed's key names.
                item['contfirstname'] = contact.get('firstname', "")
                item['contlastname'] = contact.get('lastname', "")
                item['contphonenumber'] = contact.get('phoneNumber', "")
                item['contsalutation'] = contact['salutation']

                # Courtage — intentionally left blank in this version.
                item['hascourtage'] = ''

                # Additions 2
                item['floorplan'] = data['floorplan']
                item["garden"] = data.get("garden", "")
                item["guesttoilet"] = data.get("guestToilet", "")
                item["isbarrierfree"] = data.get("isBarrierFree", "")
                item["lift"] = data.get("lift", "")
                item["listingtype"] = data["listingType"]
                item["livingspace"] = data["livingSpace"]
                item["numberofrooms"] = data["numberOfRooms"]

                # Price
                price = data["price"]
                item["currency"] = price["currency"]
                item["marketingtype"] = price["marketingType"]
                item["priceintervaltype"] = price["priceIntervalType"]
                item["value"] = price["value"]

                # Additions 3
                item["privateoffer"] = data.get("privateOffer", "")
                item["realtorcompanyname"] = data.get("realtorCompanyName", "")
                item["realtorlogo"] = data.get("realtorLogo", "")
                item["spotlightlisting"] = data["spotlightListing"]
                item["streamingvideo"] = data["streamingVideo"]

                # Title picture (optional)
                titlePicture = data.get("titlePicture", "")
                try:
                    item["creation"] = titlePicture["@creation"]
                except (KeyError, TypeError):
                    item["creation"] = ""
                try:
                    item['media_count'] = len(
                        data['galleryAttachments']['attachment'])
                except (KeyError, TypeError):
                    item['media_count'] = 0
                yield item
    next_page_list = response.xpath(self.next_xpath).extract()
    if next_page_list:
        next_page = next_page_list[-1]
        print("Scraping next page", next_page)
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Parse a result-list page from the embedded ``resultListModel`` JSON,
    reading rent/area/rooms from the human-readable attribute labels, and
    follow pagination back into this method.
    """
    print(response.url)
    for line in response.xpath(self.script_xpath).extract_first().split('\n'):
        if line.strip().startswith('resultListModel'):
            immo_json = line.strip()
            # Strip the "resultListModel: " prefix (17 chars) and trailing ";".
            immo_json = json.loads(immo_json[17:-1])
            result_list = immo_json["searchResponseModel"][
                "resultlist.resultlist"]["resultlistEntries"][0][
                "resultlistEntry"]
            # Single-hit pages return a dict instead of a list; normalize
            # so iteration works.
            if isinstance(result_list, dict):
                result_list = [result_list]
            for result in result_list:
                item = ImmoscoutItem()
                data = result["resultlist.realEstate"]
                item['immo_id'] = data['@id']
                item['url'] = response.urljoin("/expose/" + str(data['@id']))
                item['title'] = data['title']
                address = data['address']
                # street/houseNumber are optional in the feed.
                try:
                    item['address'] = address['street'] + " " + address['houseNumber']
                except KeyError:
                    item['address'] = None
                item['city'] = address['city']
                item['zip_code'] = address['postcode']
                item['district'] = address['quarter']
                for attr in result['attributes'][0]['attribute']:
                    if attr['label'] == "Kaltmiete":
                        item['rent'] = attr['value'][:-2]  # remove units
                    if attr['label'] == u"Wohnfläche":
                        item['sqm'] = attr['value'][:-3]  # remove units
                    if attr['label'] == "Zimmer":
                        item['rooms'] = attr['value']
                try:
                    contact = data['contactDetails']
                    item['contact_name'] = (
                        contact['firstname'] + " " + contact["lastname"])
                except KeyError:
                    item['contact_name'] = None
                try:
                    item['media_count'] = len(
                        data['galleryAttachments']['attachment'])
                except (KeyError, TypeError):
                    item['media_count'] = 0
                try:
                    item['lat'] = address['wgs84Coordinate']['latitude']
                    item['lng'] = address['wgs84Coordinate']['longitude']
                except KeyError:
                    item['lat'] = None
                    item['lng'] = None
                yield item
    # On the last results page the pagination xpath matches nothing;
    # indexing [-1] on that empty list raised IndexError (the
    # `is not None` check could never fire).
    next_page_list = response.xpath(self.next_xpath).extract()
    if next_page_list:
        next_page = next_page_list[-1]
        print("Scraping next page", next_page)
        next_page = response.urljoin(next_page)
        yield scrapy.Request(next_page, callback=self.parse)