def parse_stores(self, response):
    location = re.findall(r"[;?]ll=[^(&)]+", response.body_as_unicode())
    addr_full = response.xpath(
        '//div[@class="cq-shop-info"]/p[1]/text()'
        '|//div[@id="cq-shop-info"]/p[1]/text()'
        '|//div[@id="cq-full-width"]/div[@class="cq-content"]/div/p[@class="shop-address"]/text()'
    ).extract()

    if len(addr_full) > 2:
        city = addr_full[1]
        postcode = addr_full[2]
    else:
        city = addr_full[1].split(',')[0]
        postcode = addr_full[1].split(',')[1]

    if len(location) > 0:
        # location[0] looks like ";ll=<lat>,<lon>" -- strip the 4-char prefix
        # and split on the comma.
        lat = float(location[0][4:].split(",")[0])
        lon = float(location[0][4:].split(",")[1])
    else:
        lat = ''
        lon = ''

    properties = {
        'addr_full': addr_full[0],
        'phone': response.xpath(
            'normalize-space(//div[@id="cq-shop-info"]/p[@class="impinfo"]/text())'
        ).extract_first(),
        'city': city,
        'state': '',
        'postcode': postcode,
        'ref': response.url,
        'website': response.url,
        'lat': lat,
        'lon': lon,
    }

    # hours = self.parse_hours(response.xpath('//ul[@class="cleanList srHours srSection"]/li'))
    # if hours:
    #     properties['opening_hours'] = hours

    yield inputoutput(**properties)
def parse(self, response):
    data = json.loads(response.body_as_unicode())
    for key, value in data.items():
        if 'AddressLine' in value:
            addr_full = value['AddressLine'].split(',')
            address = ", ".join(addr_full[:len(addr_full) - 1])
            city = addr_full[len(addr_full) - 1]
        else:
            address = ""
            city = ""

        if 'postcode' in value:
            postcode = value['postcode']
        else:
            postcode = ""

        properties = {
            'ref': key,
            'name': value['branch_name'],
            'addr_full': address,
            'city': city,
            'country': 'United Kingdom',
            'postcode': postcode,
            'lat': value['Latitude'],
            'lon': value['Longitude'],
            'phone': value['telephone'],
        }

        opening_hours = self.store_hours(value)
        if opening_hours:
            properties['opening_hours'] = opening_hours

        yield inputoutput(**properties)
def parse_store(self, response):
    json_data = response.xpath(
        '//head/script[@type="application/ld+json"]/text()')[1].extract()
    json_data = json_data.replace(
        '// if the location file does not have the hours separated into open/close for each day, remove the below section',
        '')
    data = json.loads(json_data)

    properties = {
        'phone': data['telephone'],
        'website': response.xpath('//head/link[@rel="canonical"]/@href')[0].extract(),
        'ref': data['@id'],
        'opening_hours': self.store_hours(data['openingHoursSpecification']),
        'lon': float(data['geo']['longitude']),
        'lat': float(data['geo']['latitude']),
    }

    address = self.address(data['address'])
    if address:
        properties.update(address)

    yield inputoutput(**properties)
def parse(self, response):
    stores = json.loads(response.body_as_unicode())
    for store in stores:
        props = {}
        store_info = store.get('store_info', '')
        props['ref'] = store_info['corporate_id']
        props['lat'] = store_info['latitude']
        props['lon'] = store_info['longitude']
        props['state'] = store_info['region']
        props['city'] = store_info['locality']
        props['opening_hours'] = self.parse_hours(store_info.get('store_hours', ''))
        props['addr_full'] = ', '.join([
            store_info['address'],
            store_info.get('address_extended', '')
        ])

        sieve_out = ['website', 'phone', 'postcode', 'country']
        props.update({key: store_info[key] for key in sieve_out})

        yield inputoutput(**props)
def parse(self, response):
    results = json.loads(response.body_as_unicode())
    for data in results['results']:
        ref = data['id_suc']
        name = "Coto " + data['desc_suc']
        street = data['direccion']
        phone = data['telefono']
        lat = data['latitud']
        lon = data['longitud']

        mon_thu = "Mo-Th " + data['hor_lu_a_ju']
        fri = "Fr " + data['hor_vi']
        sat = "Sa " + data['hor_sa']
        sun = "Su " + data['hor_do'] if data['hor_do'] != "Cerrado" else "Su off"
        opening_hours = "{}; {}; {}; {}".format(mon_thu, fri, sat, sun).replace(' a ', '-')

        yield inputoutput(
            ref=ref,
            lat=lat,
            lon=lon,
            name=name,
            street=street,
            country="Argentina",
            phone=phone,
            addr_full=street,
            opening_hours=opening_hours,
        )
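# Illustrative only: with hypothetical feed values such as
#   hor_lu_a_ju = "8:00 a 21:00", hor_vi = "8:00 a 22:00",
#   hor_sa = "8:00 a 22:00", hor_do = "Cerrado"
# the formatting above produces
#   "Mo-Th 8:00-21:00; Fr 8:00-22:00; Sa 8:00-22:00; Su off"
# ("Cerrado" is Spanish for "closed"; the " a " between times means "to").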
def parse(self, response):
    data = response.xpath('.//div[@class="location-listing-item row"]')
    for store in data:
        ref = self.parse_Ref(store)
        # Use relative XPaths (".//") so each field is read from the current
        # store element rather than from the first match in the whole page.
        properties = {
            'ref': ref,
            'addr_full': store.xpath(".//span[@class='address']//text()").extract_first().strip(),
            'city': store.xpath(".//span[@class='city']//text()").extract_first().strip(),
            'state': store.xpath(".//span[@class='state']//text()").extract_first().strip(),
            'postcode': store.xpath(".//span[@class='zip']//text()").extract_first().strip(),
            'phone': store.xpath(".//span[@class='phone']//text()").extract_first().strip(),
            'name': store.xpath(".//strong//text()").extract_first().strip(),
            'lon': store.xpath("@data-lon").extract_first(),
            'lat': store.xpath("@data-lat").extract_first(),
        }
        yield inputoutput(**properties)
def parse(self, response):
    data = json.loads(response.body_as_unicode())
    for store in data.get('features', []):
        store_info = store['properties']
        properties = {
            "ref": store_info['id'],
            'addr_full': store_info['addressLine1'],
            'city': store_info['addressLine3'],
            'state': store_info['subDivision'],
            'country': store_info['addressLine4'],
            'postcode': store_info['postcode'],
            'phone': store_info.get('telephone'),
            'lon': store['geometry']['coordinates'][0],
            'lat': store['geometry']['coordinates'][1],
        }

        hours = store_info.get('restauranthours')
        try:
            hours = self.store_hours(hours)
            if hours:
                properties['opening_hours'] = hours
        except Exception:
            # Catch Exception rather than using a bare "except:", so that
            # SystemExit/KeyboardInterrupt are not swallowed.
            self.logger.exception("Couldn't process opening hours: %s", hours)

        yield inputoutput(**properties)
def parse_detail_product(self, response):
    product = response.meta.get('product')
    open_dates = response.xpath('//table[@id="hours-table"]//tr')
    product['opening_hours'] = self.store_hours(open_dates) if len(open_dates) > 0 else u'24/7'
    yield inputoutput(**product)
def parse_us(self, response):
    results = json.loads(response.body_as_unicode())
    stores = results['stores']
    for store_key in stores:
        store_data = stores[store_key]
        properties = {
            'phone': store_data['phone'],
            'addr_full': store_data['address1'].title(),
            'city': store_data['city'].title(),
            'state': store_data['stateCode'],
            'postcode': store_data['postalCode'],
            'lon': float(store_data['longitude']),
            'lat': float(store_data['latitude']),
            'ref': store_key,
        }

        hours = store_data['storeHours'] if 'storeHours' in store_data else None
        opening_hours = None
        if hours and ("Please call" not in hours):
            opening_hours = self.store_hours(hours)
        if opening_hours:
            properties['opening_hours'] = opening_hours

        yield inputoutput(**properties)
def parse_store(self, response):
    # There are newlines in the opening hours, which is bad JSON. We turn
    # off strict mode so Python's JSON library will parse it.
    json_content = response.xpath('//script[@type="application/ld+json"]/text()').extract_first()
    data = json.loads(json_content, strict=False)
    store_data = data[0]

    properties = {
        'website': store_data['url'],
        'name': store_data['name'],
        'phone': store_data['address']['telephone'],
        'ref': store_data['url'],
        'addr_full': store_data['address']['streetAddress'],
        'postcode': store_data['address']['postalCode'],
        'state': store_data['address']['addressRegion'],
        'city': store_data['address']['addressLocality'],
        'lon': float(store_data['geo']['longitude']),
        'lat': float(store_data['geo']['latitude']),
    }

    opening_hours = self.store_hours(store_data['openingHours'])
    if opening_hours:
        raw = store_data['openingHours']
        formatted = opening_hours
        yield inputoutput(raw, formatted)
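# Minimal illustration (assumed data, not the real page) of why strict=False is
# needed: the stdlib JSON parser rejects literal control characters inside
# string values unless strict mode is turned off.
#
#   snippet = '{"openingHours": "Mo-Fr 9:00-17:00\nSa 9:00-13:00"}'
#   json.loads(snippet)                # JSONDecodeError: invalid control character
#   json.loads(snippet, strict=False)  # parses; the newline stays in the value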
def process_store(self, store):
    opening_hours, phone = ('', '')
    data = store.xpath(
        '//div[@class="col-lg-4"]/div/*[not(self::h2 or self::strong)]//text()'
    ).extract()
    normalize_data = [val for val in [info.strip() for info in data] if val]
    final_data = [clean for clean in normalize_data if clean not in SIEVE]

    city, state_zip = final_data[2].split(',')
    state, pcode = state_zip.strip().split()

    if 'Phone Number' in final_data:
        phone = final_data[final_data.index('Phone Number') + 1]
    if 'Store Hours' in final_data:
        opening_hours = self.parse_hours(
            final_data[final_data.index('Store Hours') + 1:][0])

    props = {
        'addr_full': final_data[1],
        'ref': store.url,
        'city': city,
        'postcode': pcode,
        'state': state,
        'website': store.url,
        'opening_hours': opening_hours,
        'phone': phone,
    }
    yield inputoutput(**props)
def parse_links(self, response):
    # extract_first() already returns a single string, so use it directly;
    # indexing it (hours[0]) would only pass the first character along.
    hours = response.xpath(
        '//form[@id="directions-form"]/input[@name="hours"]/@value'
    ).extract_first()
    website = response.xpath('//head/link[@rel="canonical"]/@href').extract_first()
    link_id = website.split("/")[-2]

    # properties = {
    #     "addr_full": response.xpath('//form[@id="directions-form"]/input[@name="address"]/@value').extract_first(),
    #     "city": response.xpath('//form[@id="directions-form"]/input[@name="city"]/@value').extract_first(),
    #     "state": response.xpath('//form[@id="directions-form"]/input[@name="state"]/@value').extract_first(),
    #     "postcode": response.xpath('//form[@id="directions-form"]/input[@name="zip"]/@value').extract_first(),
    #     "phone": response.xpath('//form[@id="directions-form"]/input[@name="phone"]/@value').extract_first(),
    #     "website": website,
    #     "ref": link_id,
    #     "opening_hours": self.process_hours(hours),
    #     "lat": float(response.xpath('//form[@id="directions-form"]/input[@name="lat"]/@value').extract_first()),
    #     "lon": float(response.xpath('//form[@id="directions-form"]/input[@name="long"]/@value').extract_first()),
    # }
    # yield inputoutput(**properties)

    raw = hours
    formatted = self.process_hours(hours)
    yield inputoutput(raw, formatted)
def parse(self, response):
    data = json.loads(response.body_as_unicode())
    for store in data:
        # properties = {
        #     "ref": store.get('id'),
        #     "name": store.get('name'),
        #     "addr_full": store.get('address'),
        #     "city": store.get('city'),
        #     "state": store.get('state'),
        #     "postcode": store.get('zip'),
        #     "phone": store.get('telephone'),
        # }
        # if store.get('url'):
        #     properties['website'] = 'https://www.whitecastle.com' + store.get('url')
        # if store.get('latitude'):
        #     properties['lat'] = float(store.get('latitude'))
        # if store.get('longitude'):
        #     properties['lon'] = float(store.get('longitude'))

        if store.get('timetable'):
            raw = store.get('timetable')
            formatted = self.store_hours(store.get('timetable'))
            yield inputoutput(raw, formatted)
def parse_page(self, response):
    row = response.xpath('//tr')
    for i in row:
        storeid = i.xpath('./td[1]/text()').extract_first()
        name = i.xpath('./td[2]/text()').extract_first()
        street = i.xpath('./td[3]/text()').extract_first()
        city = i.xpath('./td[4]/text()').extract_first()
        state = i.xpath('./td[5]/text()').extract_first()
        postcode = i.xpath('./td[6]/text()').extract_first()
        phone = i.xpath('./td[7]/text()').extract_first()
        lat = i.xpath('./td[8]/text()').extract_first()
        lon = i.xpath('./td[9]/text()').extract_first()
        addr_full = "{} {}, {} {}".format(street, city, state, postcode)

        yield inputoutput(
            ref=storeid,
            name=name,
            street=street,
            city=city,
            state=state,
            postcode=postcode,
            addr_full=addr_full,
            phone=phone,
            lat=lat,
            lon=lon,
        )
def parse(self, response):
    results = json.loads(response.body_as_unicode())
    if results:
        for i in results:
            # ref = i['storeid']
            # name = i['restaurantname']
            # street = i['address1']
            # city = i['city']
            # state = i['statecode']
            # postcode = i['zipcode']
            # phone = i['phone']
            # lon = i['longitude']
            # lat = i['latitude']
            hours = self.convert_hours(i['businesshours'])
            # addr_full = "{} {}, {} {}".format(street, city, state, postcode)
            # yield inputoutput(
            #     ref=ref,
            #     name=name,
            #     street=street,
            #     city=city,
            #     state=state,
            #     postcode=postcode,
            #     addr_full=addr_full,
            #     phone=phone,
            #     lon=lon,
            #     lat=lat,
            #     opening_hours=hours
            # )
            raw = i['businesshours']
            formatted = hours
            yield inputoutput(raw, formatted)
def parse(self, response):
    for match in response.xpath(
        "//h2[contains(@class,'font-weight-700 text-uppercase')]/parent::div/parent::div"
    ):
        # cityState = match.xpath(".//div[contains(@class,'heading-text el-text')]/div/p/text()").extract_first()
        # cityString = cityState.split(",")[0].strip()
        # stateString = cityState.split(",")[1].strip()
        # addressString = match.xpath(".//div[contains(@class,'uncode_text_column')]/p[contains(@style,'text-align: center;')][not(.//strong)]/text()").extract_first().strip()
        # postcodeString = addressString.split(stateString)[1].strip()
        # addressString = addressString.split(stateString)[0].replace(',', '').strip().strip(cityString).strip()
        # if match.xpath(".//div[contains(@class,'uncode_text_column')]/p[contains(@style,'text-align: center;')][not (.//strong)]/br/following-sibling::text()").extract_first() is None:
        #     phoneString = ""
        # else:
        #     phoneString = match.xpath(".//div[contains(@class,'uncode_text_column')]/p[contains(@style,'text-align: center;')][not (.//strong)]/br/following-sibling::text()").extract_first()
        #     phoneString = phoneString.replace(' ', '').strip()

        hoursString = ""
        unfmthours = ''
        for hoursMatch in match.xpath(
            ".//p[contains(@style,'text-align: center;')]/strong//following-sibling::text()"
        ):
            unfmthours = unfmthours + ' ' + hoursMatch.extract().replace('\n', '')
            hoursString = hoursString + ' ' + self.store_hours(
                hoursMatch.extract().replace('\n', ''))

        raw = unfmthours
        formatted = hoursString
        yield inputoutput(raw, formatted)
def parse(self, response):
    data = json.loads(response.body_as_unicode())
    for store in data['results']:
        # properties = {
        #     "ref": store['id'],
        #     "name": store['name'],
        #     "opening_hours": store['hours']['operating'],
        #     "addr_full": store['address'],
        #     "city": store['city'],
        #     "state": store['state'],
        #     "postcode": store['zip'],
        #     "country": store['country'],
        #     "lon": float(store['lon']),
        #     "lat": float(store['lat']),
        #     "phone": store['phone'],
        # }
        raw = store['hours']['operating']
        formatted = store['hours']['operating']
        yield inputoutput(raw, formatted)

    next_url = data['next']
    if next_url is not None:
        next_url = response.urljoin(next_url)
        yield scrapy.Request(url=next_url, headers=HEADERS, callback=self.parse)
def parse_store(self, response):
    json_data = response.xpath(
        '//script[@type="text/javascript"]/text()'
    ).extract_first().replace('\n', '').replace('\t', '').split('.push(')[1].rstrip(')')
    data = json.loads(json_data)

    geojson_data = response.xpath(
        '//script[@class="js-store-finder-initial-state"][@type="application/json"]/text()'
    ).extract_first()
    geodata = json.loads(geojson_data)

    # properties = {
    #     'name': data['seoData']['name'],
    #     'ref': data['seoData']['name'],
    #     'addr_full': data['seoData']['address']['streetAddress'],
    #     'city': data['seoData']['address']['addressLocality'],
    #     'postcode': data['seoData']['address']['postalCode'],
    #     'country': data['seoData']['address']['addressCountry'],
    #     'website': response.request.url,
    #     'opening_hours': str(data['seoData']['openingHours']).replace('[', '').replace(']', '').replace("'", ''),
    #     'lat': float(geodata['store']['latlng']['lat']),
    #     'lon': float(geodata['store']['latlng']['lng']),
    # }

    raw = str(data['seoData']['openingHours'])
    formatted = str(data['seoData']['openingHours']).replace('[', '').replace(']', '').replace("'", '')
    yield inputoutput(raw, formatted)
def parse(self, response):
    data = json.loads(response.body_as_unicode())
    stores = data.get('Stores', None)
    for store in stores:
        # Build a fresh dict per store so values set for a previous store
        # (e.g. opening_hours) cannot leak into the next item.
        props = {}
        props['lat'] = store.pop('Latitude', None)
        props['lon'] = store.pop('Longitude', None)
        props['ref'] = store.pop('StoreID', None)
        props['website'] = URL

        for new_key, old_keys in NORMALIZE_KEYS:
            # .get() avoids a KeyError when one of the source keys is absent.
            props[new_key] = ", ".join([
                store.pop(key, '').strip() for key in old_keys if store.get(key)
            ])

        opening_hours = normalize_time(store.pop('Hours', ''))
        if opening_hours:
            props['opening_hours'] = opening_hours
        props.pop('Hours', None)

        yield inputoutput(**props)
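# Illustrative only: NORMALIZE_KEYS and URL are module-level constants defined
# elsewhere in this spider. The loop above only assumes NORMALIZE_KEYS is an
# iterable of (new_key, old_keys) pairs; a hypothetical shape would be:
#
#   URL = 'https://example.com/store-locator'        # assumed base URL
#   NORMALIZE_KEYS = (
#       ('addr_full', ['Address1', 'Address2']),      # hypothetical source keys
#       ('city', ['City']),
#       ('state', ['State']),
#       ('postcode', ['Zip']),
#       ('phone', ['Phone']),
#   )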
def parse_location(self, response):
    unp = {}  # Unprocessed properties
    properties = {}

    unp['phone'] = response.xpath('//span[@itemprop="telephone"]/a/text()').extract_first()
    unp['name'] = response.xpath(
        '//span[@itemprop="name"]/h2[@class="loc_d_title"]/text()').extract_first()
    unp['ref'] = response.url
    unp['website'] = response.url

    addressdiv = response.xpath('//div[@itemprop="address"]')[0]
    unp['addr_full'] = addressdiv.xpath('.//span[@itemprop="streetAddress"]/text()').extract_first()
    unp['city'] = addressdiv.xpath('.//span[@itemprop="addressLocality"]/text()').extract_first()
    unp['state'] = addressdiv.xpath('.//span[@itemprop="addressRegion"]/text()').extract_first()
    unp['postcode'] = addressdiv.xpath('.//span[@itemprop="postalCode"]/text()').extract_first()

    hours = response.xpath('//ul[@class="loc_d_times row"]/li/text()').extract()
    raw = hours
    opening_hours = None
    if hours:
        opening_hours = self.store_hours(','.join(hours))
    if opening_hours:
        formatted = opening_hours
        yield inputoutput(raw, formatted)

    for key in unp:
        if unp[key]:
            properties[key] = unp[key].strip()
def parse(self, response):
    # retrieve JSON data from REST endpoint
    # items = response.xpath('//text()').extract()
    # convert data variable from unicode to string
    # items = str(items)
    # convert type string representation of list to type list
    # data = [items]
    # load list into json object for parsing
    jsondata = json.loads(response.body_as_unicode())
    # iterate items
    for item in jsondata['d']['results']:
        # print str(item['Address1'])
        yield inputoutput(
            ref=item['EntityID'],
            lat=float(item['Latitude']),
            lon=float(item['Longitude']),
            addr_full=self.parseAddr(item['Address1'], item['Address2']),
            city=item['Locality'],
            state=item['AdminDistrict'],
            postcode=item['PostalCode'],
            name=item['MallName'],
            phone=item['Phone'],
            opening_hours=item['StoreHours'],
        )
def parse_location(self, response):
    ref = (response.xpath('//a[@class="btn set-as-location"]/@data-loc-id').extract_first()
           or response.request.url)

    properties = {
        "phone": response.xpath('//div[@class="module"]/p/a/text()').extract_first(),
        "ref": ref,
        "name": response.xpath('//div[@class="location-details"]/h1/text()').extract_first(),
        "opening_hours": self.store_hours(response.xpath('//dl[@class="hours"]')[0]),
        "lon": float(
            response.xpath('//span[@id="currentlocdistanceid"]/@data-long').extract_first()),
        "lat": float(
            response.xpath('//span[@id="currentlocdistanceid"]/@data-lat').extract_first()),
    }

    properties.update(self.address(response))
    yield inputoutput(**properties)
def parse_stores(self, response):
    ref = response.meta['id']
    json_data = json.loads(response.body_as_unicode())
    if 'address1' not in json_data:
        return

    properties = {
        'addr_full': json_data['address1'],
        'phone': json_data['phoneNumber'],
        'city': json_data['city'],
        'state': json_data['stateCode'],
        'postcode': json_data['postalCode'],
        'ref': ref,
        'website': "http://www.acehardware.com/mystore/index.jsp?store=" + ref,
        'lat': float(json_data['latitude']),
        'lon': float(json_data['longitude']),
    }

    hours = self.parse_hours(json_data['hours'])
    raw = json_data['hours']
    if hours:
        properties['opening_hours'] = hours
        formatted = hours
        yield inputoutput(raw, formatted)
def parse(self, response):
    json_str = response.body_as_unicode()
    data = json.loads(json_str)['locations']
    for store in data:
        store_details = store['bing']
        num, street = store_details['AddressLine'].split(' ', 1)

        properties = {
            "phone": store_details['Phone'],
            "ref": store_details['EntityID'],
            "name": store['post']['post_title'],
            "opening_hours": self.store_hours(store_details),
            "lat": store_details['Latitude'],
            "lon": store_details['Longitude'],
            "addr_full": store_details['AddressLine'],
            "housenumber": num,
            "street": street,
            "city": store_details['Locality'],
            "state": store_details['AdminDistrict'],
            "postcode": store_details['PostalCode'],
            "country": store_details['CountryRegion'],
            "website": store['url'],
        }

        yield inputoutput(**properties)
def parse_store(self, response):
    contacts = response.xpath('//ul[@class="contact"]/li/span/text()').extract()
    properties = {
        'addr_full': contacts[0],
        'city': contacts[1],
        'state': contacts[2],
        'postcode': contacts[3],
        'phone': contacts[4],
        'ref': response.url,
        'website': response.url,
    }

    day_groups = response.xpath(
        '//ul[@class="hours"]/li[@class="storehours"]/text()').extract()
    opening_hours = []
    for day_group in day_groups:
        match = re.match(r'(.*): (\d+)-(\d+)', day_group)
        if not match:
            # Skip lines that don't follow the "<days>: <open>-<close>" pattern.
            continue
        days, f_hr, t_hr = match.groups()
        f_hr = int(f_hr)
        # The closing hour is given as a 12-hour value and assumed to be PM.
        t_hr = int(t_hr) + 12
        opening_hours.append('{} {:02d}:00-{:02d}:00'.format(days, f_hr, t_hr))

    if opening_hours:
        properties['opening_hours'] = '; '.join(opening_hours)

    yield inputoutput(**properties)
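# For example, a source line like "Mon-Sat: 9-9" matches the pattern above and
# is emitted as "Mon-Sat 09:00-21:00"; multiple day groups are joined with "; ".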
def parse_store(self, response):
    properties = {
        'website': response.xpath('//head/link[@rel="canonical"]/@href').extract_first(),
        'ref': str(
            response.xpath('/html/body/div[1]/div[1]/header/h1/text()').extract()
        ).strip("['']"),
        'opening_hours': re.sub(
            r'\s+', ' ',
            response.css('#secondary').extract()[0]
            .split('<h5>Hours</h5>')[1]
            .replace('<br>', '')
            .replace('</aside>', '')
            .replace('\t', ' ')
            .replace('\n', '')
            .replace('\r', ' ')
        ).strip(),
        # 'lon': float(data['geo']['longitude']),  # no lon on page
        # 'lat': float(data['geo']['latitude']),  # no lat on page
    }

    address = self.address(
        response.xpath('/html/body/div[1]/div[1]/aside/address/text()').extract())
    if address:
        properties.update(address)

    yield inputoutput(**properties)
def parse(self, response):
    phoneregex = re.compile(r'^<a.+>([0-9\-]+)</a>$')
    stores = json.loads(response.body_as_unicode())
    for key, value in stores.items():
        all_address = value['address'].split(',')
        len_address = len(all_address)
        state_zipcode = all_address[len_address - 1]
        zipcode = re.findall(r"(\d{5})", state_zipcode)
        addr_full = re.findall(r"^[^(,|.)]+", value['address'])[0]

        if len(zipcode) > 0:
            zipcode = zipcode[0]
        else:
            zipcode = ''

        state = re.findall(r"([A-Z]{2})", state_zipcode)
        if len(state) > 0:
            state = state[0]
        else:
            state = ''

        properties = {
            'ref': value['ID'],
            'name': value['title'],
            'addr_full': addr_full,
            'city': value['title'],
            'state': state,
            'postcode': zipcode,
            'lat': value['location']['lat'],
            'lon': value['location']['lng'],
        }
        if value['phone']:
            properties['phone'] = value['phone']

        yield inputoutput(**properties)
def parse_store(self, response):
    address = response.xpath('//div[@class="street"]//text()').extract_first().strip()
    postalCode, city = self.parse_city(
        response.xpath('//div[@class="postal-code-city"]//text()').extract_first())

    phone = response.xpath('//div[@class="field--phone"]//text()').extract_first()
    if phone:
        phone = phone.strip()
    else:
        phone = ""

    properties = {
        'addr_full': address,
        'city': city,
        'name': "McDonald's",
        'postcode': postalCode,
        'phone': phone,
        'ref': response.meta['ref'],
        'lon': response.meta['lon'],
        'lat': response.meta['lat'],
    }

    data = re.finditer(r'<span class="label">.*</span>(.*)</li>',
                       response.body_as_unicode())
    opening_hours = self.store_hours(data)
    if opening_hours:
        properties['opening_hours'] = opening_hours

    yield inputoutput(**properties)
def parse(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    for store in jsonresponse:
        addr_full = store['address'] + ", " + store['city'] + " " + store['state'] + " " + store['zip']

        datestring = store['hours']
        hour_match = re.findall(r"(\d{1,2}:\d{1,2})", datestring)
        for hour in hour_match:
            if hour != "9:00":
                raise DifferentHours(
                    "Store added with different hours than 09:00-21:00")

        properties = {
            'name': store['store'],
            'addr_full': addr_full,
            'street': store['address'],
            'city': store['city'],
            'state': store['state'],
            'postcode': store['zip'],
            'country': store['country'],
            'phone': store['phone'],
            'website': store['permalink'],
            'opening_hours': '09:00-21:00',
            'ref': store['id'] + " " + store['store'],
            'lat': float(store['lat']),
            'lon': float(store['lng']),
        }
        yield inputoutput(**properties)
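# Note: DifferentHours is assumed to be a custom exception defined elsewhere in
# this spider's module; a minimal sketch would be:
#
#   class DifferentHours(Exception):
#       pass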
def parse(self, response):
    # testing
    response.selector.remove_namespaces()
    for store_elem in response.xpath('//LocationFinderStore'):
        city = store_elem.xpath('./City/text()').extract_first()
        lat = store_elem.xpath('./Latitude/text()').extract_first()
        lon = store_elem.xpath('./Longitude/text()').extract_first()
        ref = store_elem.xpath('./StoreNumber/text()').extract_first()
        addr_full = store_elem.xpath('./StreetAddress/text()').extract_first()
        zipcode = store_elem.xpath('./ZipCode/text()').extract_first()
        state = store_elem.xpath('./State/text()').extract_first()
        name = store_elem.xpath('./Name/text()').extract_first()

        properties = {
            'name': name,
            'addr_full': addr_full,
            'city': city,
            'state': state,
            'postcode': zipcode,
            'ref': ref,
            'website': 'http://locations.in-n-out.com/' + ref,
            'lon': float(lon),
            'lat': float(lat),
        }
        yield inputoutput(**properties)