def fetch_uk(body, data): start = body.find(u'<div class="fableft">') if start == -1: print 'Error in finding %s stores' % data['name'] return [] body, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>') if end == 0: print 'Error in finding %s stores' % data['name'] return [] store_list = [] for m in re.findall(ur'<div>\s*(.+?)\s*</div>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['name'] addr_list = re.findall(ur'<p>\s*(.+?)\s*</p>', m) tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] if data['name'] == 'AUSTRALIA': country, province, city = gs.addr_sense(', '.join(addr_list), data['name']) if city is not None: entry[cm.city_e] = city if province is not None: entry[cm.province_e] = province else: city = addr_list[-2].strip().upper() entry[cm.city_e] = city ret = gs.look_up(city, 3) if ret is not None and ret['country']['name_e'] == gs.look_up( 'UK', 1)['name_e']: entry[cm.city_e] = ret['name_e'] entry[cm.zip_code] = addr_list[-1].strip().upper() entry[cm.addr_e] = ', '.join(addr_list) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_store_detail(s, data, isOfficial=False):
    """Build a store entry from a single retailer record.

    :param s: mapping with the raw fields read here: ``name``, ``city``,
        ``address``, ``email``, ``phone``, ``fax``, ``lat`` and ``lng``.
    :param data: dict providing ``brand_id``, ``brandname_e``,
        ``brandname_c``, ``country`` and ``city`` (the latter is used when
        the record's own city is empty).
    :param isOfficial: selects the store class, ``'Official Retailer'`` when
        true and ``'Retailer'`` otherwise.
    :return: the populated entry; nothing is written to the database here.
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])

    entry[cm.name_e] = cm.html2plain(s['name']).strip()
    entry[cm.country_e] = data['country']

    # Prefer the city carried by the record, fall back to the requested one.
    record_city = cm.html2plain(s['city']).strip().upper()
    entry[cm.city_e] = cm.extract_city(record_city or data['city'])[0]

    entry[cm.addr_e] = cm.html2plain(s['address']).strip()
    entry[cm.email] = s['email'].strip()
    entry[cm.tel] = s['phone'].strip()
    entry[cm.fax] = s['fax'].strip()

    if isOfficial:
        entry[cm.store_class] = 'Official Retailer'
    else:
        entry[cm.store_class] = 'Retailer'

    # A bad or missing coordinate is logged and otherwise ignored.
    try:
        latitude = s['lat']
        entry[cm.lat] = '' if latitude == '' else string.atof(latitude)
    except (ValueError, KeyError, TypeError) as e:
        cm.dump('Error in fetching lat: %s' % str(e), log_name)

    try:
        longitude = s['lng']
        entry[cm.lng] = '' if longitude == '' else string.atof(longitude)
    except (ValueError, KeyError, TypeError) as e:
        cm.dump('Error in fetching lng: %s' % str(e), log_name)

    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    # Index 1 feeds the province, index 2 the city; filled fields are kept.
    for idx, field in ((1, cm.province_e), (2, cm.city_e)):
        if sensed[idx] is not None and entry[field] == '':
            entry[field] = sensed[idx]
    gs.field_sense(entry)

    return entry
def fetch_stores(data): url = data['store_url'] try: body = cm.get_data(url, { 'country': data['country'], 'city': data['city'] }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] raw = json.loads(body) store_list = [] for item in raw['items']: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'].strip().upper() tmp = cm.extract_city(data['city'])[0] if entry[cm.country_e] == 'USA': entry[cm.province_e] = tmp else: entry[cm.city_e] = tmp gs.field_sense(entry) addr = cm.reformat_addr(item['address'].replace(u'\\', '')) addr_list = [tmp.strip() for tmp in addr.split(',')] tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] entry[cm.addr_e] = ', '.join(addr_list) entry[cm.store_type] = item['shop_type'] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_store_details(data): url = data['host'] + data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching store details: %s' % url, log_name) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) start = body.find(ur'<h3>available in store</h3>') if start != -1: type_sub = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0] entry[cm.store_type] = ', '.join( cm.html2plain(tmp).strip() for tmp in re.findall(ur'<li[^<>]*>(.+?)</li>', type_sub, re.S)) start = body.find(ur"<div class='gmap_info_box'") if start == -1: cm.dump('Error in fetching store details: %s' % url, log_name) return [] body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] raw = json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['table'] entry[cm.name_e] = cm.html2plain(raw['name']) entry[cm.city_e] = data['city'].strip().upper() entry[cm.country_e] = data['country'].strip().upper() # entry[cm.store_type] = data['store_type'] entry[cm.addr_e] = cm.reformat_addr(raw['address']) m = re.search(re.compile(ur'phone:(.*?)fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() entry[cm.fax] = m.group(2).strip() else: m = re.search(re.compile(ur'phone:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search(re.compile(ur'fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.fax] = m.group(1).strip() entry[cm.hours] = raw['hours'] if raw['lat'] is not None and raw['lat'] != '': entry[cm.lat] = string.atof(raw['lat']) if raw['lng'] is not None and raw['lng'] != '': entry[cm.lat] = string.atof(raw['lng']) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], 
entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return [entry]
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m1 in re.finditer(ur'<lignecountry\s+titre\s*=\s*"([^"]+)"', body): country = m1.group(1).strip().upper() if country == 'U.S.A.': country = 'US' sub_country = cm.extract_closure(body[m1.start():], ur'<lignecountry\b', ur'</lignecountry>')[0] for m2 in re.finditer(ur'<lignecity\s+titre\s*=\s*"([^"]+)"', sub_country): city = m2.group(1).strip().upper() sub_city = cm.extract_closure(sub_country[m2.start():], ur'<lignecity\b', ur'</lignecity>')[0] m3 = re.search(ur'<!\[CDATA\[(.+?)\]\]>', sub_city, re.S) if m3 is None: continue sub_city = m3.group(1) store_subs = re.split(ur'<\s*h2\s*>\s*LANVIN BOUTIQUE\s*<\s*/h2\s*>', sub_city) for s in store_subs: if s.strip() == '': continue m4 = re.search(ur'<p>(.+?)</p>', s, re.S) if m4 is None: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = country entry[cm.city_e] = city s = m4.group(1) m4 = re.search(ur'(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.addr_e] = cm.reformat_addr(m4.group(1)) m4 = re.search(ur'Phone:(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.tel] = cm.reformat_addr(m4.group(1).strip()) m4 = re.search(ur'Boutique Hours:(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.hours] = cm.reformat_addr(m4.group(1).strip()) m4 = re.search(ur'Products available:(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.store_type] = m4.group(1).strip() m4 = re.search(ur'Email:\s*<a href="mailto:([^"]+)">', s) if m4 is not None: entry[cm.email] = m4.group(1).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], 
entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) code = data['country_code'] if gs.look_up(code, 1) is None: entry[cm.country_e] = cm.html2plain(data['country']).strip().upper() else: entry[cm.country_e] = code entry[cm.name_e] = data['store_name'] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.lat] = data['lat'] if data['lat'] is not None else '' entry[cm.lng] = data['lng'] if data['lng'] is not None else '' m = re.search(ur'data-boutique\s*=\s*"%s"' % data['store_id'], data['content']) sub = data['content'][m.end():] m1 = re.search(ur'<li class="isDistributeur[^<>]+>(.+?)</li>', sub) if m1 is not None: entry[cm.store_class] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="place-title[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="contacts[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: m2 = re.search(ur'<a class="popupLaunch" href="([^"]+)"', m1.group(1)) if m2: entry = fetch_details(data, m2.group(1), entry) m2 = re.search(ur'<p>(.+?)</p>', m1.group(1), re.S) if m2: ct_list = tuple( tmp.strip() for tmp in cm.reformat_addr(m2.group(1)).split(',')) entry[cm.tel] = cm.extract_tel(ct_list[0]) if len(ct_list) > 1: entry[cm.email] = ct_list[1].strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return tuple(entry)
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error occured in fetching stores: %s' % url, 'canali_log.txt') dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<div class="storeInfo">', body): sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0] m1 = re.search(ur'<span itemprop="streetAddress">(.+?)</span>', sub) if m1 is None: cm.dump('Error: failed to find the address: %s' % url, 'canali_log.txt') continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) entry[cm.country_e] = data['country'] entry[cm.city_e] = data['city'] ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] m1 = re.search(ur'<span itemprop="telephone">(.+?)</span>', sub) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'data-latitude="(.+?)"', sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'data-longitude="(.+?)"', sub) if m1 is not None: entry[cm.lng] = string.atof(m1.group(1)) gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), 'canali_log.txt') db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): url = data['store_url'] param = {'myid': data['key'], 'idioma': 'in'} try: body = cm.get_data(url, param) except Exception: cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name) return [] store_list = [] for s in json.loads(body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.name_e] = cm.reformat_addr(s['title']) m = re.search(ur'(.+?)-\s*<', s['key']) addr_list = [entry[cm.name_e]] if m is not None: m1 = re.search(ur'-+', m.group(1)) if m1 is not None: tmp = [m.group(1)[:m1.start()], m.group(1)[m1.end():]] else: tmp = [m.group(1)] if len(tmp) > 1: entry[cm.tel] = cm.extract_tel(tmp[1]) m1 = re.search(ur'\d{4,}', tmp[0]) if m1 is not None: entry[cm.zip_code] = m1.group() addr_list.append(tmp[0].strip()) entry[cm.addr_e] = ', '.join(addr_list) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_hk(data): loc_list = ('Hong Kong', 'Kowloon', 'Macau', 'New Territories') url = 'http://levi.com.hk/hk/storelocator' store_list = [] for loc in loc_list: param = {'loc': loc} try: body = cm.get_data(url, param) except Exception, e: cm.dump('Error in fetching stores: %s' % param, log_name) continue start = body.find(ur'<div id="addWrapper">') if start == -1: cm.dump('Error in fetching stores: %s' % param, log_name) continue sub = cm.extract_closure(body[start:], ur'<ul>', ur'</ul>')[0] for s in re.findall(ur'<li>(.+?)</li>', sub, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = 'MACAU' if loc == 'Macau' else 'HONG KONG' entry[cm.city_e] = entry[cm.country_e] m = re.search(ur'<div id="addStore">([^<>]+)', s) entry[cm.addr_e] = cm.html2plain(m.group(1)) if m else '' m = re.search(ur'<div id="addAddress">([^<>]+)', s) tmp = cm.html2plain(m.group(1)) pat = re.compile(ur'business hours?\s*[:\.]?\s*', re.I) if re.search(pat, tmp): entry[cm.hours] = re.sub(pat, '', tmp).strip() m = re.search(ur'<div id="addPhone">([^<>]+)', s) tmp = cm.html2plain(m.group(1)) pat = re.compile(ur'(tel|phone|telephone)?\s*[:\.]?\s*', re.I) if re.search(pat, tmp): entry[cm.tel] = re.sub(pat, '', tmp).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def func(item):
    """Turn one store element into an entry and insert it.

    Relies on ``data``, ``url``, ``db``, ``logger`` and ``pq`` from the
    enclosing scope.  The entry's ``native_id`` is an MD5 fingerprint of the
    page ``url`` combined with the map link (or, when there is no map link,
    with the address).

    :param item: callable selector object for one store; it is queried for
        ``h6``, ``p`` and ``a.track_map[href]``.
    :return: the inserted entry, or None when its fingerprint is already in
        ``data['store_list']``.
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.name_e] = cm.html2plain(item('h6')[0].text).strip()

    addr_html = unicode(pq(item('p')[0]))
    addr_list = [
        piece.strip() for piece in cm.reformat_addr(addr_html).split(',')
    ]
    # The last address piece is dropped when it is a telephone number.
    tel = cm.extract_tel(addr_list[-1])
    if tel != '':
        entry[cm.tel] = tel
        addr_list.pop()
    entry[cm.addr_e] = ', '.join(addr_list)

    map_links = item('a.track_map[href]')
    digest = hashlib.md5()
    digest.update(url)
    if len(map_links) > 0:
        map_ref = map_links[0].attrib['href']
        digest.update(map_ref)
        # Keep the map query (with '+' turned into spaces) for geocoding.
        query = re.search(r'q=([^;]+?)&', cm.html2plain(map_ref))
        if query:
            entry['geo_query_param'] = query.group(1).replace('+', ' ')
    else:
        digest.update(entry[cm.addr_e])
    entry[cm.native_id] = digest.hexdigest()

    # Already known store: nothing to do.
    if entry[cm.native_id] in data['store_list']:
        return None

    entry[cm.country_e] = data['country']
    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e])
    for idx, field in enumerate((cm.country_e, cm.province_e, cm.city_e)):
        if sensed[idx] is not None and entry[field] == '':
            entry[field] = sensed[idx]
    gs.field_sense(entry)

    logger.info(
        ('(%s / %d) Found store: %s, %s (%s, %s)' %
         (data['brandname_e'], data['brand_id'], entry[cm.name_e],
          entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])))
    cm.insert_record(db, entry, data['table'])
    return entry
def fetch_stores(data): url = data['home_url'] try: body = cm.post_data(url, {'lz_sf': data['province'], 'lz_sx': data['city']}) except Exception: cm.dump('Error in fetching stores: %s, %s, %s' % (url, data['province'], data['city']), 'samsonite_log.txt') dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = body.find(u'搜索结果') if start == -1: cm.dump('Error in fetching stores: %s, %s, %s' % (url, data['province'], data['city']), 'samsonite_log.txt') return [] body = body[start + 4:] store_list = [] for m in re.findall(ur'</script>\s*(\S+)\s*</span>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = m.strip() entry[cm.addr_e] = m.strip() entry[cm.city_c] = data['city'] ret = gs.look_up(data['city'], 3) if ret is not None: entry[cm.city_e] = cm.extract_city(ret['name_e'])[0] if ret['province'] != '': entry[cm.province_e] = ret['province']['name_e'] entry[cm.province_c] = data['province'] ret = gs.look_up(data['province'], 2) if ret is not None: entry[cm.province_e] = ret['name_e'] entry[cm.country_e] = u'CHINA' gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), 'benetton_log.txt', False) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores_cn(data): vals = ['Storenew%d.html' % idx for idx in xrange(1, 10)] vals.append('Store.html') store_list = [] for url in ('http://www.hushpuppies.com.cn/%s' % term for term in vals): try: body = cm.get_data(url) except Exception, e: cm.dump('Error in fetching stores: %s' % url, log_name) return () city_map = dict((m[0].strip(), m[1].strip()) for m in re.findall( ur'<a rel="([^"]+)" href="#"[^<>]*>([^<>]+)', body)) start = body.find(ur'<div id="all-list-wrap" style="float:left">') if start == -1: cm.dump('Error in fetching stores: %s' % url, log_name) return () sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] for m in re.findall(ur'<ul id="([^"]+)"[^<>]*>(.+?)</ul>', sub, re.S): city = city_map[m[0].strip()] for store in re.findall(ur'<li><a>([^<>]+)', m[1]): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = u'CHINA' entry[cm.city_e] = city entry[cm.addr_e] = store.strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(db, data, logger):
    """Scrape the Paul & Joe store list and insert every store.

    The page pairs each country link with a ``<ul>`` of stores; inside a
    country each ``a.marker-store`` link is paired by position with a
    ``span.store-infos`` element.

    :param db: database handle passed through to ``cm.insert_record``.
    :param data: dict providing ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :param logger: logger receiving one ``info`` line per store.
    :return: tuple of the store entries.
    """
    page = pq(url='http://www.paulandjoe.com/en/ozcms/stores/list/?country_id=&postcode=')
    store_list = []

    # Country
    country_links = page('#store_list>li>a')
    country_blocks = page('#store_list>li>ul')
    assert len(country_links) == len(country_blocks)

    for i, country_link in enumerate(country_links):
        country = country_link.text.strip().upper()
        markers = pq(country_blocks[i])('a.marker-store')
        infos = pq(country_blocks[i])('span.store-infos')
        assert len(markers) == len(infos)

        for j, marker in enumerate(markers):
            entry = cm.init_store_entry(data['brand_id'],
                                        data['brandname_e'],
                                        data['brandname_c'])

            # Coordinates are stored only when both are present and truthy.
            raw_lat = marker.attrib['data-latitude']
            lat = float(raw_lat) if raw_lat else None
            raw_lng = marker.attrib['data-longitude']
            lng = float(raw_lng) if raw_lng else None
            if lat and lng:
                entry[cm.lat] = lat
                entry[cm.lng] = lng

            entry[cm.name_e] = marker.text
            entry[cm.addr_e] = cm.reformat_addr(str(pq(infos[j])))
            entry[cm.country_e] = country

            gs.field_sense(entry)
            sensed = gs.addr_sense(entry[cm.addr_e])
            for idx, field in enumerate((cm.country_e, cm.province_e,
                                         cm.city_e)):
                if sensed[idx] is not None and entry[field] == '':
                    entry[field] = sensed[idx]
            gs.field_sense(entry)

            logger.info('(%s/%d) Found store: %s, %s (%s, %s)' %
                        (data['brandname_e'], data['brand_id'],
                         entry[cm.name_e], entry[cm.addr_e],
                         entry[cm.country_e], entry[cm.continent_e]))
            store_list.append(entry)
            cm.insert_record(db, entry, 'spider_stores.stores')

    return tuple(store_list)
def fetch_cn(data):
    """Read the lee.com.cn store-finder XML and insert every store.

    :param data: dict providing ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: tuple of inserted store entries.
    """
    url = 'http://www.lee.com.cn/xml/storefinder.xml'
    # XML element name -> entry field, in the order they are read.
    field_map = (
        ('shop_name', cm.name_e),
        ('city', cm.city_e),
        ('province', cm.province_e),
        ('district', cm.district_e),
        ('address', cm.addr_e),
        ('tel', cm.tel),
    )

    store_list = []
    for table in pq(url=url)('NewDataSet Table'):
        store = pq(table)
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.country_e] = 'CHINA'
        for tag, field in field_map:
            # Missing or empty text is stored as an empty string.
            entry[field] = store(tag)[0].text or ''

        gs.field_sense(entry)
        sensed = gs.addr_sense(entry[cm.addr_e])
        for idx, field in enumerate((cm.country_e, cm.province_e,
                                     cm.city_e)):
            if sensed[idx] is not None and entry[field] == '':
                entry[field] = sensed[idx]
        gs.field_sense(entry)

        cm.dump(
            '(%s / %d) Found store: %s, %s (%s, %s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e],
             entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return tuple(store_list)
def fetch_stores(data): url = data['store_url'] try: html = cm.get_data(url, {'nazione': data['country_e'], 'citta': data['city_e'], 'tipo': 'tutti'}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<marker\b', html): sub, start, end = cm.extract_closure(html[m.start():], ur'<marker\b', ur'</marker>') if end == 0: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'name\s*=\s*"(.+?)"', sub) if m1 is not None: entry[cm.name_e] = cm.html2plain(m1.group(1)).strip() addr_list = [] m1 = re.search(ur'address\s*=\s*"(.+?)"', sub) if m1 is not None and cm.html2plain(m1.group(1)).strip() != '': addr_list.append(cm.html2plain(m1.group(1)).strip()) m1 = re.search(ur'addr2\s*=\s*"(.+?)"', sub) if m1 is not None and cm.html2plain(m1.group(1)).strip() != '': addr_list.append(cm.html2plain(m1.group(1)).strip()) entry[cm.addr_e] = ', '.join(addr_list) m1 = re.search(ur'city\s*=\s*"(.+?)"', sub) if m1 is not None: entry[cm.city_e] = cm.html2plain(m1.group(1)).strip().upper() m1 = re.search(ur'country\s*=\s*"(.+?)"', sub) if m1 is not None: entry[cm.country_e] = cm.html2plain(m1.group(1)).strip().upper() m1 = re.search(ur'zipcode\s*=\s*"(.+?)"', sub) if m1 is not None: entry[cm.name_e] = cm.html2plain(m1.group(1)).strip() m1 = re.search(ur'phone\s*=\s*"(.+?)"', sub) if m1 is not None: entry[cm.tel] = cm.html2plain(m1.group(1)).strip() m1 = re.search(ur'email\s*=\s*"(.+?)"', sub) if m1 is not None: entry[cm.email] = cm.html2plain(m1.group(1)).strip() m1 = re.search(ur'website\s*=\s*"(.+?)"', sub) if m1 is not None: entry[cm.url] = cm.html2plain(m1.group(1)).strip() m1 = re.search(ur'lat\s*=\s*"(.+?)"', sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) m1 = re.search(ur'lng\s*=\s*"(.+?)"', sub) if m1 is not None: entry[cm.lng] = 
string.atof(m1.group(1)) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_stores(data): param = { 'action': 'getStoresFromAjax', 'country': data['country_code'], 'region': data['city'], 'collection': '' } url = data['url'] try: body = cm.post_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return [] store_list = [] for m1 in re.finditer(ur'<div class="shop-type-container">', body): sub = cm.extract_closure(body[m1.start():], ur'<div\b', ur'</div>')[0] store_class = '' m2 = re.search(ur'<div class="shop-type-title">(.+?)</div>', sub, re.S) if m2 is not None: store_class = cm.reformat_addr(m2.group(1)) for m2 in re.finditer(ur'<div class="shop"', sub): store_sub = cm.extract_closure(sub[m2.start():], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.store_class] = store_class entry[cm.country_e] = data['country_code'] entry[cm.city_e] = cm.extract_city(data['city'])[0] m3 = re.search( ur'loadStore\((\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\)', store_sub) if m3 is not None: data['store_id'] = string.atoi(m3.group(1)) entry[cm.lat] = string.atof(m3.group(2)) entry[cm.lng] = string.atof(m3.group(3)) entry[cm.store_type] = ', '.join(get_detail(data)) m3 = re.search( ur'<div class="shop-name shop-details shop-main-name">([^<>]+)</div>', store_sub) if m3 is not None: entry[cm.name_e] = m3.group(1).strip() addr_list = [] m3 = re.search( ur'<div class="shop-street shop-details">([^<>]+)</div>', store_sub) if m3 is not None: addr_list.append(cm.reformat_addr(m3.group(1))) m3 = re.search( ur'<div class="shop-city shop-details">([^<>]+)</div>', store_sub) if m3 is not None: tmp = cm.reformat_addr(m3.group(1)) m3 = re.search(ur'(\d{4,})', tmp) if m3 is not None: entry[cm.zip_code] = m3.group(1).strip() addr_list.append(tmp) entry[cm.addr_e] = ', '.join(addr_list) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] 
gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def get_store_details(data): url = data['url'] try: html = cm.post_data( url, { 'country': data['country_id'], 'city': data['city_id'], 'recordid': data['store_id'] }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) info = json.loads(html)['elements'] addr = cm.reformat_addr(info['address'].replace('\\', '').replace( '<p>', ',').replace('</p>', ',')) # 第一行为商店名称 terms = addr.split(',') if len(terms) > 0: entry[cm.name_e] = cm.reformat_addr(terms[0]) entry[cm.addr_e] = addr gmap_url = info['gmap'] m = re.findall(ur'(-?\d+\.\d+),(-?\d+\.\d+)', gmap_url) if len(m) > 0: cm.update_entry(entry, { cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1]) }) entry[cm.url] = info['shareurl'].replace('\\', '') entry[cm.hours] = info['openingtimes'] entry[cm.comments] = info['other'] # Geo country = data['country'] city = data['city'] cm.update_entry(entry, {cm.country_e: country, cm.city_e: city}) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return entry
def fetch_stores(data):
    """Download the JSON store list at ``data['url']`` and insert the stores.

    :param data: dict providing ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: list of inserted store entries; an empty list when the request
        fails.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    raw = json.loads(body)
    store_list = []
    for s in raw:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.name_e] = s['storename']
        entry[cm.addr_e] = cm.reformat_addr(', '.join(
            [s['building'].replace(u'operated by ', u''),
             s['street'].strip()]))

        # The country may be null; it used to be dereferenced again in the
        # city branch without a check.
        country = s['country'].strip() if s['country'] is not None else u''
        if s['country'] is not None:
            entry[cm.country_e] = country.upper()

        if s['city'] is not None:
            if country == u'US':
                # US cities come as "<city>, <state>".
                tmp = s['city'].split(',')
                entry[cm.city_e] = tmp[0].strip().upper()
                if len(tmp) > 1:
                    entry[cm.province_e] = tmp[1].strip().upper()
            else:
                entry[cm.city_e] = s['city'].strip().upper()
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        if s['zip'] is not None:
            entry[cm.zip_code] = s['zip'].strip()
        if s['phone'] is not None:
            entry[cm.tel] = s['phone'].strip()
        if s['storeemail'] is not None:
            entry[cm.email] = s['storeemail'].strip()
        # A 'storelink' containing '@' is not stored as a URL.
        if s['storelink'] is not None and u'@' not in s['storelink']:
            entry[cm.url] = s['storelink'].strip()
        if s['storetype'] is not None:
            entry[cm.store_class] = s['storetype'].strip()

        hours = []
        for item in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                     'saturday', 'sunday']:
            if s[item] is not None:
                hours.append('%s: %s' % (item, s[item]))
        entry[cm.hours] = ', '.join(hours)

        styles = []
        for item in ['menswear', 'womenswear', 'kidswear']:
            if s[item] == '1':
                styles.append(item)
        entry[cm.store_type] = ', '.join(styles)

        if s['latitude'] is not None and s['latitude'].strip() != '':
            entry[cm.lat] = string.atof(s['latitude'])
        if s['longitude'] is not None and s['longitude'].strip() != '':
            entry[cm.lng] = string.atof(s['longitude'])

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        gs.field_sense(entry)

        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' %
                (data['brandname_e'], data['brand_id'], entry[cm.name_e],
                 entry[cm.addr_e], entry[cm.country_e],
                 entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return store_list
def fetch_stores(data):
    """Page through the JSON store list of one country and insert the stores.

    Pages are requested from 1 up to the ``TotalPages`` value reported by
    the service.  The region-id to region-name table of the country is
    cached in the module-level ``region_map`` the first time it is seen.

    :param data: dict providing ``data_url``, ``country_code``,
        ``brand_id``, ``brandname_e`` and ``brandname_c``.
    :return: tuple of inserted store entries; an empty tuple when any page
        request fails.
    """
    url = data['data_url']
    param = {
        'output': 'json',
        'country': data['country_code'],
        'brand': 'dkny'
    }
    page = 0
    tot_page = -1  # unknown until the first response has been read
    store_list = []
    while True:
        page += 1
        if tot_page != -1 and page > tot_page:
            break
        param['p'] = page
        try:
            body = cm.get_data(url, param)
        except Exception:
            cm.dump('Error in fetching stores: %s, %s' % (url, param),
                    log_name)
            return ()

        raw = json.loads(body)
        tot_page = raw['Stores']['TotalPages']

        if data['country_code'] not in region_map:
            # Build the state (region) table
            region_map[data['country_code']] = dict(
                (item['RegionId'], item['Name']) for item in raw['Regions'])

        for s in raw['Stores']['Items']:
            entry = cm.init_store_entry(data['brand_id'],
                                        data['brandname_e'],
                                        data['brandname_c'])
            entry[cm.country_e] = data['country_code'].upper()
            entry[cm.city_e] = cm.extract_city(s['City'])[0]
            entry[cm.name_e] = cm.html2plain(s['Name']).strip()
            entry[cm.addr_e] = cm.reformat_addr(s['Address'])
            entry[cm.tel] = s['Phone'].strip() if s['Phone'] else ''
            entry[cm.fax] = s['Fax'].strip() if s['Fax'] else ''
            entry[cm.email] = s['Email'].strip() if s['Email'] else ''
            entry[cm.lat] = s['Latitude'] if s['Latitude'] else ''
            entry[cm.lng] = s['Longitude'] if s['Longitude'] else ''

            region_id = s['RegionId']
            if region_id in region_map[data['country_code']]:
                entry[cm.province_e] = cm.html2plain(region_map[
                    data['country_code']][region_id]).strip().upper()

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            cm.dump(
                '(%s / %d) Found store: %s, %s (%s, %s)' %
                (data['brandname_e'], data['brand_id'], entry[cm.name_e],
                 entry[cm.addr_e], entry[cm.country_e],
                 entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)

    # Previously nothing was returned on success; a tuple matches the
    # empty tuple used by the error path.
    return tuple(store_list)
def f(m): store_name = m[0].strip() addr_str = m[1].strip() spl = addr_str.split('<br/>') store_type = cm.html2plain(spl[0].strip()) store_addr = spl[1].strip() hour_idx = 2 store_tel = '' for i in xrange(2, len(spl)): # If this is not a phone number: tel = cm.extract_tel(spl[i]) if tel == '': store_addr += ', ' + spl[i] hour_idx = i + 1 else: store_tel = spl[i].strip() hour_idx = i + 1 break if hour_idx < len(spl): store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip() else: store_hour = '' # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]])) store_addr = cm.reformat_addr(store_addr) store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry( store_entry, { cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(), cm.country_e: opt[cm.country_e].strip().upper(), cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour, cm.tel: store_tel }) if opt.has_key(cm.province_e): store_entry[cm.province_e] = opt[cm.province_e] else: store_entry[cm.province_e] = '' store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0] gs.field_sense(store_entry) ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e]) if ret[1] is not None and store_entry[cm.province_e] == '': store_entry[cm.province_e] = ret[1] if ret[2] is not None and store_entry[cm.city_e] == '': store_entry[cm.city_e] = ret[2] gs.field_sense(store_entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e], store_entry[cm.continent_e]) db.insert_record(store_entry, 'stores') return store_entry
store_list = [] for item in country_sub: body = item['content'] country = item['name'] for m in re.findall(ur'<div>\s*(.+?)\s*</div>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = country addr_list = re.findall(ur'<p>\s*(.+?)\s*</p>', m) tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] ret = gs.addr_sense(', '.join(addr_list)) if ret[2] is not None: entry[cm.city_e] = ret[2] if ret[1] is not None: entry[cm.province_e] = ret[1] entry[cm.addr_e] = ', '.join(addr_list) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry)
val = s['website'] entry[cm.url] = cm.html2plain(val).strip() if val else '' try: val = s['lat'] entry[cm.lat] = string.atof(val) if val and val != '' else '' except (ValueError, KeyError, TypeError) as e: cm.dump('Error in fetching lat: %s' % str(e), log_name) try: val = s['lng'] entry[cm.lng] = string.atof(val) if val and val != '' else '' except (ValueError, KeyError, TypeError) as e: cm.dump('Error in fetching lng: %s' % str(e), log_name) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) if entry[cm.country_e] == '' or entry[cm.city_e] == '': ret = None location_valid = True if entry[cm.lat] != '' and entry[cm.lng] != '': ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng])) if ret is None:
def fetch_stores(data): url = data['url'] try: body = cm.post_data(url, { 'rsp': 'json', 'country': data['country_code'] }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] raw = json.loads(body) store_list = [] for s in raw['stores']: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = cm.html2plain(s['name']).strip() addr_list = [] for key in ['address1', 'address2']: if s[key].strip() != '': addr_list.append(cm.reformat_addr(s[key])) entry[cm.addr_e] = ' '.join(addr_list) # r=s['region'].strip().upper() # m = re.search(ur'\b([A-Z]{2})\b', r) # if data[cm.country_e]=='UNITED STATES' and m is not None: # # 美国 # ret = gs.look_up(m.group(1), 2) # if ret is not None: # r = ret['name_e'] # entry[cm.province_e] = r entry[cm.city_e] = cm.extract_city(s['city'])[0] entry[cm.zip_code] = s['zip'].strip() entry[cm.country_e] = data[cm.country_e] entry[cm.lat] = string.atof(s['lat']) entry[cm.lng] = string.atof(s['lng']) entry[cm.tel] = s['phone'].strip() entry[cm.fax] = s['fax'].strip() entry[cm.email] = s['emailaddress'].strip() entry[cm.url] = s['website'].strip() days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] opening = [] if 'openingHours' in s and s['openingHours'] is not None: for m in re.finditer(ur'i:(\d);s:\d+:\\?"([^\\"]+?)\\?"', s['openingHours']): opening.append( '%s: %s' % (days[string.atoi(m.group(1))], m.group(2).strip())) entry[cm.hours] = ', '.join(opening) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], 
entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url, hdr={'Accept': 'application/json'}) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] body = cm.extract_closure(body, ur'\{', ur'\}')[0] raw = json.loads( body)[u'storesCompleteResponse'][u'storesComplete'][u'storeComplete'] if not isinstance(raw, list): raw = [raw] store_list = [] for s in raw: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) if 'name' in s and s['name'] is not None: tmp = s['name'] if isinstance(tmp, str) or isinstance(tmp, unicode): entry[cm.name_e] = cm.html2plain(s['name']) if 'address' in s and s['address'] is not None: tmp = s['address'] if 'addressLine' in tmp and tmp['addressLine'] is not None: tmp = tmp['addressLine'] if isinstance(tmp, list): for i in xrange(len(tmp)): tmp[i] = unicode(tmp[i]) entry[cm.addr_e] = ', '.join(tmp) else: entry[cm.addr_e] = unicode(tmp) entry[cm.country_e] = data['country_code'] if 'latitude' in s and s[ 'latitude'] is not None and s['latitude'] != '': entry[cm.lat] = string.atof(s['latitude']) if 'longitude' in s and s[ 'longitude'] is not None and s['longitude'] != '': entry[cm.lng] = string.atof(s['longitude']) if 'openingHours' in s and s['openingHours'] is not None: tmp = s['openingHours'] if tmp is not None and 'openingHour' in tmp: tmp = tmp['openingHour'] if tmp is not None and isinstance(tmp, list): entry[cm.hours] = ', '.join(tmp) if 'phone' in s and s['phone'] is not None: entry[cm.tel] = s['phone'] if 'region' in s and s['region'] is not None and 'name' in s['region']: tmp = s['region']['name'] if tmp is not None: entry[cm.province_e] = tmp.strip().upper() if 'city' in s and s['city'] is not None: entry[cm.city_e] = cm.extract_city(s['city'])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': 
entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_dior_beauty(data): url = data['url'] store_list = [] with open('city_lite.dat', 'r') as f: sub = f.readlines() city_map = json.loads(sub[0]) country = 'CHINA' for city in city_map[country]: param = {'cityName': city} cm.dump('Searching at %s, %s' % (city, country), log_name) try: body = cm.post_data(url, param) except Exception, e: cm.dump('Error in fetching states: %s, %s' % (url, param), log_name) continue m = re.search(ur'var\s+Json\s*=', body) if not m: continue sub = cm.extract_closure(body[m.end():], ur'\{', ur'\}')[0] for store in json.loads(sub)['content']['items']: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = country entry[cm.comments] = 'BEAUTY' addr_list = [] val = store['addressLine1'] if val: addr_list.append(cm.html2plain(val).strip()) val = store['addressLine2'] if val: addr_list.append(cm.html2plain(val).strip()) entry[cm.addr_e] = ', '.join(addr_list) val = store['name'] entry[cm.name_e] = cm.html2plain(val).strip() if val else '' val = store['type'] entry[cm.store_class] = cm.html2plain(val).strip() if val else '' val = store['url'] entry[cm.url] = cm.html2plain(val).strip() if val else '' val = store['city'] entry[cm.city_e] = cm.html2plain(val).strip().upper() if val and val != '' else '' val = store['zipcode'] entry[cm.zip_code] = cm.html2plain(val).strip() if val else '' val = store['phone'] entry[cm.tel] = cm.html2plain(val).strip() if val else '' val = store['fax'] entry[cm.fax] = cm.html2plain(val).strip() if val else '' coords = store['coords'] if coords: try: entry[cm.lat] = string.atof(str(coords['lat'])) except (ValueError, KeyError, TypeError) as e: cm.dump('Error in fetching lat: %s' % str(e), log_name) try: entry[cm.lng] = string.atof(str(coords['lng'])) except (ValueError, KeyError, TypeError) as e: cm.dump('Error in fetching lng: %s' % str(e), log_name) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and 
entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) uid = u'%s|%s|%s|%s|%s,%s' % ( entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], unicode(entry[cm.lat]), unicode(entry[cm.lng])) if uid in store_map: cm.dump(u'%s already exists.' % uid) continue else: store_map[uid] = entry cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): """ 获得门店的详细信息 :rtype : [entries] :param data: """ try: html = cm.get_data(data['url']) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] entries = [] start = html.find(u'<ul class="store-list">') if start == -1: return entries start += len(u'<ul class="store-list">') end = html.find(u'</ul>', start) html = html[start:end] for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S): store = cm.init_store_entry(brand_id, brandname_e, brandname_c) store[cm.store_type] = m1[0] sub_html = m1[1] m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html) if len(m2) > 0: store[cm.name_e] = cm.reformat_addr(m2[0]) m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S) if len(m2) > 0: store[cm.addr_e] = cm.reformat_addr(m2[0]) cm.update_entry( store, { cm.continent_e: data[cm.continent_e].strip().upper(), cm.country_e: data[cm.country_e].strip().upper(), cm.city_e: data[cm.city_e].strip().upper() }) entry = store gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e]) if ret[0] is not None and entry[cm.country_e] == '': entry[cm.country_e] = ret[0] if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] print '%s: Found store: %s, %s (%s, %s)' % ( brandname_e, store[cm.name_e], store[cm.addr_e], store[cm.country_e], store[cm.continent_e]) db.insert_record(store, 'stores') entries.append(store)
start = sub.find(ur'<dt>') m2 = re.search(ur'<dd>(.+?)</dd>', sub[m1.end():start], re.S) if m2 is not None: entry[cm.hours] = m2.strip() m1 = re.search(ur'<dt>Store Carries</dt>', sub) if m1 is not None: entry[cm.store_type] = cm.reformat_addr( cm.extract_closure(sub[m1.end():], ur'<dd>', ur'</dd>')[0]) m1 = re.search(ur'<ul class="store-links">', sub) if m1 is not None: m2 = re.search(ur'<a href="([^"]+)"', sub[m1.end():]) if m2 is not None: entry[cm.url] = m2.group(1) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) # db.insert_record(entry, 'stores') store_list.append(entry) return store_list
def fetch_stores(data):
    """Fetch the dealers of one city and record them as store entries.

    Requests ``data['data_url']`` with the language / country / region / city
    ids, then walks every ``dealer`` element of the response.  The name
    ``id_set`` comes from outside this function and is used as a mapping from
    dealer id to the entry created for it.

    :param data: dict providing ``data_url``, ``lang``, ``country_id``,
        ``region_id``, ``city_id``, ``country_code``, ``region_name``,
        ``city_name``, ``brand_id``, ``brandname_e`` and ``brandname_c``.
    :return: list of the entries created for dealers not seen before; an
        empty list when the request fails.

    NOTE(review): the original indentation of this block was lost; the
    ``if``/``else`` nesting below is the most plausible reading (``else``
    paired with ``if store_id in id_set``) -- confirm against the upstream
    file before relying on it.
    """
    url = data['data_url']
    param = {'lang': data['lang'],
             'country': data['country_id'],
             'region': data['region_id'],
             'city': data['city_id']}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), 'tudor_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    # ``pq`` is presumably PyQuery (TODO confirm); each ``dealer`` element is
    # wrapped again so it can be queried by child tag name.
    for store in (pq(tmp) for tmp in pq(body.encode('utf-8'))('dealer')):
        try:
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.country_e] = data['country_code']
            entry[cm.province_e] = data['region_name'].replace('PROVINCE', '').strip()
            entry[cm.city_e] = data['city_name']

            store_id = store[0].attrib['id']
            if store_id in id_set:
                # Dealer already recorded.  For country code 'CN' the stored
                # entry gets the *_c fields (name, address, province, city)
                # from this response and its database row is replaced.
                if data['country_code'] == 'CN':
                    entry = id_set[store_id]
                    entry[cm.name_c] = cm.reformat_addr(store('name')[0].text).strip()
                    tmp = store('address')
                    entry[cm.addr_c] = cm.reformat_addr(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                    entry[cm.province_c] = data['region_name']
                    entry[cm.city_c] = data['city_name']
                    # NOTE(review): the statement is built by string
                    # formatting with a value that originated in the fetched
                    # document; consider a parameterized query if ``db``
                    # supports one.
                    db.execute(u'DELETE FROM stores WHERE brand_id=%d AND native_id="%s"' % (
                        data['brand_id'], entry[cm.native_id]))
                    db.insert_record(entry, 'stores')
            else:
                # First time this dealer id is seen: build a full entry.
                entry[cm.native_id] = store_id
                entry[cm.name_e] = cm.reformat_addr(store('name')[0].text).strip()
                tmp = store('address')
                entry[cm.addr_e] = cm.reformat_addr(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                tmp = store('phone1')
                entry[cm.tel] = tmp[0].text.strip() if len(tmp) > 0 and tmp[0].text else ''
                tmp = store('fax1')
                entry[cm.fax] = tmp[0].text.strip() if len(tmp) > 0 and tmp[0].text else ''
                tmp = store('latitude')
                try:
                    entry[cm.lat] = string.atof(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                except (ValueError, KeyError, TypeError) as e:
                    cm.dump('Error in fetching lat: %s' % str(e), log_name)
                tmp = store('longitude')
                try:
                    entry[cm.lng] = string.atof(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                except (ValueError, KeyError, TypeError) as e:
                    cm.dump('Error in fetching lng: %s' % str(e), log_name)

                gs.field_sense(entry)
                # Fill country / province / city from the address only when
                # the field is still empty.
                ret = gs.addr_sense(entry[cm.addr_e])
                if ret[0] is not None and entry[cm.country_e] == '':
                    entry[cm.country_e] = ret[0]
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)

                cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                    entry[cm.name_e], entry[cm.addr_e],
                                                                    entry[cm.country_e],
                                                                    entry[cm.continent_e]), 'tudor_log.txt')
                db.insert_record(entry, 'stores')
                # Remember the entry so a later response for the same dealer
                # id takes the branch above.
                id_set[store_id] = entry
                store_list.append(entry)
        except (IndexError, TypeError) as e:
            # A malformed dealer element is reported and skipped.
            print traceback.format_exc()
            continue
    return store_list
def fetch_store_details(data): url = data['url'] try: body = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) ret = gs.look_up(data['country'], 1) if ret is not None: entry[cm.country_e] = ret['name_e'] m = re.search(ur'<span class="type">Address</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: addr = cm.reformat_addr(m.group(1)) country, province, city = gs.addr_sense(addr) if country is not None and entry[cm.country_e] == '': entry[cm.country_e] = country if province is not None: entry[cm.province_e] = province if city is not None: entry[cm.city_e] = city entry[cm.addr_e] = addr m = re.search(ur'<span class="type">Phone</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: entry[cm.tel] = m.group(1) m = re.search(ur'<span class="type">Opening hours</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: entry[cm.hours] = cm.reformat_addr(m.group(1)) m = re.search(ur'<span class="type">You can find</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: entry[cm.store_type] = cm.reformat_addr(m.group(1)) m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', body, re.S) entry[cm.lat] = string.atof(m.group(1)) entry[cm.lng] = string.atof(m.group(2)) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def fetch_store_details(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching store details: %s' % url, 'debeers_log.txt') return [] start = body.find(u'<div class="store-details">') if start == -1: cm.dump('Error in fetching store details: %s' % url, 'debeers_log.txt') body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m = re.search(ur'<h2 class="store-name">(.+?)</h2>', body) if m is not None: entry[cm.name_e] = m.group(1).strip() m_addr = re.search(ur'<div class="store-address">(.+?)</div>', body, re.S) if m_addr is not None: addr = m_addr.group(1).strip() pat_tel = re.compile(ur'<p class="store-phone">(.+?)</p>', re.S) pat_fax = re.compile(ur'<p class="store-fax">(.+?)</p>', re.S) pat_email = re.compile(ur'<p class="store-email">(.+?)</p>', re.S) m = re.search(pat_tel, addr) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search(pat_fax, addr) if m is not None: entry[cm.fax] = m.group(1).strip() m = re.search(pat_email, addr) if m is not None: entry[cm.email] = m.group(1).strip() addr = re.sub(pat_tel, '', addr) addr = re.sub(pat_fax, '', addr) addr = re.sub(pat_email, '', addr) addr = re.sub(u'<h3>.+?</h3>', '', addr) addr = cm.reformat_addr(addr) entry[cm.addr_e] = addr country, province, city = gs.addr_sense(addr) if country is not None: entry[cm.country_e] = country if province is not None: entry[cm.province_e] = province if city is not None: entry[cm.city_e] = city m = re.search(ur'<div class="store-hours">(.+?)</div>', body, re.S) if m is not None: entry[cm.hours] = cm.reformat_addr(m.group(1)) gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), 'benetton_log.txt', False) db.insert_record(entry, 'stores') return [entry]