def parse(handle):
    """Parse a store-listing HTML page and create or update a Dodger per row.

    ``handle`` is passed straight to ``BeautifulSoup.BeautifulSoup``.  Every
    ``<tr>`` whose class matches ``lightBlue`` inside the
    ``div.tableTopBorder`` container is treated as one store: the text of the
    second cell supplies the name (first line), the postcode (last line) and
    the address lines in between; the link in the last cell supplies ``lat``
    and ``lon`` query parameters.

    A failure while handling one row is printed and the remaining rows are
    still processed.  Returns None.

    NOTE(review): ``company_id``, ``brand`` and ``brand_name`` are not defined
    in this function; it presumably relies on module-level names with those
    identifiers -- confirm they exist, otherwise every row raises NameError.
    """
    soup = BeautifulSoup.BeautifulSoup(handle)
    result_div = soup.find('div', {'class': 'tableTopBorder'})
    if result_div is None:
        # No results container on this page, so there is nothing to import.
        return

    for result in result_div.findAll('tr', {'class': re.compile('lightBlue')}):
        try:
            cells = result.findAll('td')
            text = ''.join([node for node in cells[1].recursiveChildGenerator()
                            if isinstance(node, unicode)])
            lines = text.strip().split('\n')
            name = lines.pop(0)
            postcode = lines.pop(-1)

            # Remaining lines become address1, address2, ... in page order.
            addresses_dict = {}
            for index, address in enumerate(lines):
                addresses_dict["address%s" % (index + 1)] = address

            url = cells[-1].a['href']
            # parse_qs expects a bare query string; drop any "path?" prefix so
            # the first parameter is not keyed as "path?name".
            query = url.split('?', 1)[-1]
            parsed_url = urlparse.parse_qs(query)
            lat = parsed_url['lat']
            lon = parsed_url['lon']

            try:
                d = Dodger.objects.get(name=name, company=company_id, postcode=postcode)
            except Dodger.DoesNotExist:
                # Only a genuinely missing store is created; any other lookup
                # error is reported by the outer handler.
                d = Dodger(name=name, company=company_id, postcode=postcode)

            d.address1 = addresses_dict.get('address1')
            d.address2 = addresses_dict.get('address2')
            d.address3 = addresses_dict.get('address3')
            d.address4 = addresses_dict.get('address4')
            d.country = "United Kingdom"
            # NOTE(review): arguments are passed as (lat, lon); confirm this is
            # the axis order the stored Point data is expected to use.
            d.location = Point(float(lat[0]), float(lon[0]))
            d.brand = brand
            d.save()
        except Exception as e:
            # Best effort: report the bad row and carry on with the next one.
            print("%s %s" % (brand_name, e))
def parse_details(handle):
    """Parse a store-details HTML page; update phone numbers and opening times.

    ``handle`` is passed straight to ``BeautifulSoup.BeautifulSoup``.  Every
    ``<div>`` whose class matches ``transBack`` inside the
    ``#template9_middle_bottom_left`` container is treated as one store.  Its
    ``p.paddingTop10`` paragraphs are read by position: index 3 holds the name
    (first line), postcode (last line) and address lines, index 2 the phone
    number, index 1 the opening times.

    The matching Dodger (looked up by name, company and postcode) gets its
    phone saved; when the opening-times paragraph has more than three child
    nodes, the store's existing opening times are deleted and recreated from
    the seven lines that follow the first two lines of that paragraph's text
    (day_of_week 0 for the first of them through 6 for the last).

    A failure while handling one store is printed and the remaining stores are
    still processed.  Returns None.

    NOTE(review): ``company_id`` is not defined in this function; it
    presumably relies on a module-level name -- confirm it exists.
    """

    def parse_open_time(str_time):
        """Turn a line such as "Sunday: 1000-1630" into ['10:00', '16:30'].

        Times written as "10.00" or "10:00" are accepted as well.  Returns
        None when the line is empty, has no "label: open-close" shape, or the
        times are not numeric.
        """
        if not str_time or ':' not in str_time:
            return None
        # Split on the first colon only, so times that already contain a
        # colon ("Sunday: 10:00-16:30") are not truncated.
        hours = str_time.split(':', 1)[1]
        parts = hours.split('-')
        if len(parts) < 2:
            return None
        times = []
        for part in parts:
            # Reduce to bare digits first, then insert exactly one separator
            # before the minutes ("1000" -> "10:00", "900" -> "9:00").
            digits = part.replace('.', '').replace(':', '').strip()
            if not digits.isdigit() or len(digits) < 3:
                return None
            times.append(digits[:-2] + ':' + digits[-2:])
        return times

    soup = BeautifulSoup.BeautifulSoup(handle)
    result_div = soup.find('div', {'id': 'template9_middle_bottom_left'})
    if result_div is None:
        # No details container on this page, so there is nothing to import.
        return

    for result in result_div.findAll('div', {'class': re.compile('transBack')}):
        try:
            cells = result.findAll('p', {'class': 'paddingTop10'})
            text = ''.join([node for node in cells[3].recursiveChildGenerator()
                            if isinstance(node, unicode)])
            lines = text.strip().split('\n')
            name = lines.pop(0)
            postcode = lines.pop(-1)

            # Built for parity with parse(); not used further in this function.
            addresses_dict = {}
            for index, address in enumerate(lines):
                addresses_dict["address%s" % (index + 1)] = address

            # Keep only the first number when several are joined by " or ".
            phone = cells[2].contents[1].split(' or ')[0].strip()

            try:
                d = Dodger.objects.get(name=name, company=company_id, postcode=postcode)
            except Dodger.DoesNotExist:
                d = Dodger(name=name, company=company_id, postcode=postcode)
                print("unknown store")
            d.phone = phone
            d.save()

            hours_cell = cells[1]
            if len(hours_cell) > 3:
                hours_text = ''.join([node for node in hours_cell.recursiveChildGenerator()
                                      if isinstance(node, unicode)])
                day_lines = hours_text.split('\n')[2:]
                # Indexing (not slicing) on purpose: an incomplete week raises
                # IndexError before the existing opening times are deleted.
                week = [day_lines[day] for day in range(7)]

                # Delete all opening times for this Dodger
                d.opening_times.all().delete()

                open_close = None  # bound up front so the handler can print it
                try:
                    for day_of_week, day_text in enumerate(week):
                        open_close = parse_open_time(day_text)
                        if open_close:
                            d.opening_times.create(day_of_week=day_of_week,
                                                   open_time=open_close[0],
                                                   close_time=open_close[1])
                except Exception as e:
                    print("error parsing opening times")
                    print(e)
                    print(open_close)
        except Exception as e:
            # Best effort: report the bad store and carry on with the next one.
            print(e)
def scrape():
    """Fetch the store list for each Arcadia brand and sync it into the database.

    For every (brand id, brand name) pair below, the brand row is fetched or
    created, the brand's store-stock JSONP feed is downloaded, and each store
    in ``data['stores']['store']`` is created or updated as a Dodger keyed by
    company, ``doger_id`` and brand.  The store's opening times are deleted
    and recreated from the ``openingMon`` .. ``openingSun`` fields
    (day_of_week 0 .. 6).

    Errors while creating opening times are printed and the scrape continues;
    network, JSON and model errors outside that step propagate to the caller.
    Returns None.
    """

    def parse_open_time(str_time):
        """Turn "08:00-21:00" (or "08.00-21.00") into ['08:00', '21:00'].

        Returns None when the value is empty or has no "-" separator.
        """
        if str_time:
            open_close = str_time.split('-')
            if len(open_close) >= 2:
                return [v.replace('.', ':') for v in open_close]
        return None

    company_id = Dodger.C_ARCADIA

    # The feed URL is identical for every brand apart from the brand id.
    url_template = (
        "http://cloudservices.arcadiagroup.co.uk/storestock/storestock"
        "?brand=%d&jsonp_callback=jsonp1292077523475"
        "&lat=51.461752&long=-0.114286&dist=50000&res=10000000&_=1292077544710"
    )
    brands = (
        (12551, 'Burton'),
        (12552, 'Dorothy Perkins'),
        (12553, 'Evans'),
        (12554, 'Miss selfridge'),
        (12555, 'Topshop'),
        (12556, 'Topman'),
        (12557, 'Wallis'),
    )
    # Feed keys in day_of_week order (index 0 is Monday's key).
    opening_keys = (
        'openingMon', 'openingTue', 'openingWed', 'openingThu',
        'openingFri', 'openingSat', 'openingSun',
    )

    for brand_id, brand_name in brands:
        url = url_template % brand_id

        try:
            brand = Brand.objects.get(brand_id=brand_id)
        except Brand.DoesNotExist:
            brand = Brand(brand_id=brand_id, name=brand_name)
            brand.save()

        request = urllib2.Request(url, headers={'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
        response = urllib2.urlopen(request)
        try:
            body = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()

        # The feed is JSONP ("callback({...});"): keep what lies between the
        # first "(" and the last ")" instead of assuming fixed offsets.
        start = body.find('(')
        end = body.rfind(')')
        if start != -1 and end > start:
            body = body[start + 1:end]
        data = json.loads(body)

        for store in data['stores']['store']:
            # Store text values as UTF-8 byte strings; leave any non-text
            # value untouched rather than failing on a missing .encode().
            for k, v in store.items():
                if isinstance(v, unicode):
                    store[k] = v.encode('utf8')

            try:
                d = Dodger.objects.get(company=company_id, doger_id=store['storeId'], brand=brand)
            except Dodger.DoesNotExist:
                d = Dodger()

            d.name = store['storeName']
            d.company = company_id
            d.brand = brand
            d.doger_id = store['storeId']
            d.address1 = store.get('address1')
            d.address2 = store.get('address2')
            d.address3 = store.get('address3')
            d.address4 = store.get('address4')
            d.postcode = store.get('postcode')
            d.phone = store.get('telephoneNumber')

            # A missing, empty or non-numeric coordinate means "no location"
            # for this store; it must not abort the whole scrape.
            try:
                latitude = float(store.get('latitude') or 0)
                longitude = float(store.get('longitude') or 0)
            except (TypeError, ValueError):
                latitude = longitude = 0.0
            if latitude and longitude:
                # NOTE(review): arguments are passed as (lat, lon); confirm
                # this is the axis order the stored Point data should use.
                d.location = Point(latitude, longitude)

            d.country = store['country']
            d.save()

            # Delete all opening times for this Dodger
            d.opening_times.all().delete()

            open_close = None  # bound up front so the handler can print it
            try:
                for day_of_week, key in enumerate(opening_keys):
                    open_close = parse_open_time(store.get(key))
                    if open_close:
                        d.opening_times.create(day_of_week=day_of_week,
                                               open_time=open_close[0],
                                               close_time=open_close[1])
            except Exception as e:
                print(e)
                print(open_close)