def fetch_uk(body, data): start = body.find(u'<div class="fableft">') if start == -1: print 'Error in finding %s stores' % data['name'] return [] body, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>') if end == 0: print 'Error in finding %s stores' % data['name'] return [] store_list = [] for m in re.findall(ur'<div>\s*(.+?)\s*</div>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['name'] addr_list = re.findall(ur'<p>\s*(.+?)\s*</p>', m) tel = cm.extract_tel(addr_list[-1]) if tel != '': entry[cm.tel] = tel del addr_list[-1] if data['name'] == 'AUSTRALIA': country, province, city = gs.addr_sense(', '.join(addr_list), data['name']) if city is not None: entry[cm.city_e] = city if province is not None: entry[cm.province_e] = province else: city = addr_list[-2].strip().upper() entry[cm.city_e] = city ret = gs.look_up(city, 3) if ret is not None and ret['country']['name_e'] == gs.look_up( 'UK', 1)['name_e']: entry[cm.city_e] = ret['name_e'] entry[cm.zip_code] = addr_list[-1].strip().upper() entry[cm.addr_e] = ', '.join(addr_list) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): url = data['home_url'] try: body = cm.post_data(url, {'lz_sf': data['province'], 'lz_sx': data['city']}) except Exception: cm.dump('Error in fetching stores: %s, %s, %s' % (url, data['province'], data['city']), 'samsonite_log.txt') dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = body.find(u'搜索结果') if start == -1: cm.dump('Error in fetching stores: %s, %s, %s' % (url, data['province'], data['city']), 'samsonite_log.txt') return [] body = body[start + 4:] store_list = [] for m in re.findall(ur'</script>\s*(\S+)\s*</span>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.name_e] = m.strip() entry[cm.addr_e] = m.strip() entry[cm.city_c] = data['city'] ret = gs.look_up(data['city'], 3) if ret is not None: entry[cm.city_e] = cm.extract_city(ret['name_e'])[0] if ret['province'] != '': entry[cm.province_e] = ret['province']['name_e'] entry[cm.province_c] = data['province'] ret = gs.look_up(data['province'], 2) if ret is not None: entry[cm.province_e] = ret['name_e'] entry[cm.country_e] = u'CHINA' gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), 'benetton_log.txt', False) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_countries(data, logger):
    """Fetch the countries of one continent from the store-locator service.

    For every country that ``gs.look_up`` recognises by its ISO-2 code, the
    ISO-3 code reported by the service is also registered in
    ``gs.country_map`` (both in the record and in the lookup table).

    :param data: dict with 'url' and 'continent_id'; it is copied, never
        modified.
    :param logger: object with an ``error`` method, or None.
    :return: list of copies of ``data`` extended with 'country_id' and
        'country_code'; [] when the request or the JSON decoding fails.
    """
    url = data['url']
    param = {'action': 'getCountriesByContinent',
             'idContinent': data['continent_id'],
             'filter': 'clothing;lacoste%20l!ve'}
    try:
        body = cm.get_data(url, param)
        # Decode inside the try block so that a malformed response is
        # reported the same way as a failed request.
        raw = json.loads(body)['root']['DATA']['countries']
    except Exception:
        if logger is not None:
            logger.error(u'Error in fetching countries: %s, %s' % (url, param))
        return []

    results = []
    for c in raw:
        d = data.copy()
        code = c['country']['iso2']
        d['country_id'] = c['country']['id']
        d['country_code'] = code

        ret = gs.look_up(code, 1)
        if ret is not None:
            # Teach the geo tables the ISO-3 alias of this country.
            uid = gs.country_map['lookup'][code]
            gs.country_map['data'][uid]['iso3'] = c['country']['iso3']
            gs.country_map['lookup'][c['country']['iso3']] = uid
        results.append(d)
    return results
def fetch_stores(data): body = data['body'] start = body.find(u'<ul class="storelist storelist_%s' % data['code']) if start == -1: cm.dump('Error in finding stores for %s' % data['code']) return [] body = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0] store_list = [] for m in re.findall(ur'<li class="sitem">(.+?)</li>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'<h3>(.+?)</h3>', m) if m1 is not None: entry[cm.name_c] = m1.group(1).strip() m1 = re.search(ur'<div class="addr">(.+?)</div>', m) if m1 is not None: entry[cm.addr_e] = m1.group(1).replace(u'地址:', '').replace(u'地址:', '').strip() m1 = re.search(ur'<div class="tel">(.+?)</div>', m) if m1 is not None: entry[cm.tel] = m1.group(1).replace(u'电话:', '').replace(u'电话:', '').strip() entry[cm.city_c] = data['city'] ret = gs.look_up(data['city'], 3) if ret is not None: entry[cm.city_e] = ret['name_e'] entry[cm.city_c] = ret['name_c'] if ret['province'] != '': entry[cm.province_e] = ret['province']['name_e'] entry[cm.country_e] = u'CHINA' gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), 'canali_log.txt') db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_countries(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] # 处理重定向 m = re.search('<h2>Object moved to <a href="(.+?)">', html) if m is not None: data['url'] = data['host'] + m.group(1) return fetch_countries(data) m = re.search('<span class="country">Choose a country</span>', html) if m is None: return [] sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>') if end == 0: return [] country_list = [] for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub): d = data.copy() country_e = cm.html2plain(m[1]).strip().upper() ret = gs.look_up(country_e, 1) if ret is not None: country_e = ret['name_e'] d['country_e'] = country_e d['province_e'] = '' d['url'] = data['host'] + m[0] country_list.append(d) return country_list
def fetch_stores(data): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) code = data['country_code'] if gs.look_up(code, 1) is None: entry[cm.country_e] = cm.html2plain(data['country']).strip().upper() else: entry[cm.country_e] = code entry[cm.name_e] = data['store_name'] entry[cm.city_e] = cm.extract_city(data['city'])[0] entry[cm.lat] = data['lat'] if data['lat'] is not None else '' entry[cm.lng] = data['lng'] if data['lng'] is not None else '' m = re.search(ur'data-boutique\s*=\s*"%s"' % data['store_id'], data['content']) sub = data['content'][m.end():] m1 = re.search(ur'<li class="isDistributeur[^<>]+>(.+?)</li>', sub) if m1 is not None: entry[cm.store_class] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="place-title[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<li class="contacts[^<>]+>(.+?)</li>', sub, re.S) if m1 is not None: m2 = re.search(ur'<a class="popupLaunch" href="([^"]+)"', m1.group(1)) if m2: entry = fetch_details(data, m2.group(1), entry) m2 = re.search(ur'<p>(.+?)</p>', m1.group(1), re.S) if m2: ct_list = tuple( tmp.strip() for tmp in cm.reformat_addr(m2.group(1)).split(',')) entry[cm.tel] = cm.extract_tel(ct_list[0]) if len(ct_list) > 1: entry[cm.email] = ct_list[1].strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return tuple(entry)
def get_countries(data):
    """
    Return the list of countries (per the original docstring).

    NOTE(review): the part of the function visible here only downloads the
    page and builds ``state_list``; no country list is built and nothing is
    returned on the success path -- confirm whether the rest of the body is
    missing.

    :rtype : [{'country_code':**, 'country':**}, ...]
    :param data: dict; only data['url'] is read here.
    :return: [] when the page cannot be downloaded.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        # NOTE(review): brand_id is not a local name; presumably a
        # module-level global -- verify.
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        }
        cm.dump(dump_data)
        return []

    # Cut the page into sections, each starting at one occurrence of the
    # "Choose a country" placeholder option.
    pat = '<option value="0">Choose a country</option>'
    splits = [m.start() for m in re.finditer(pat, html)]
    # -1 as the final boundary: the last section stops one character before
    # the end of the page.
    splits.append(-1)
    sub_html = []
    for i in xrange(len(splits) - 1):
        sub_html.append(html[splits[i]:splits[i + 1]])

    # 1: state information
    # s_map = [{'state_code':m[0], 'state':m[1].strip}
    state_list = []
    # Only the first section is scanned, with the placeholder option skipped.
    # NOTE(review): sub_html[0] raises IndexError when the placeholder does
    # not occur in the page.
    for m in re.findall(ur'<option value="(.+?)"\s*?>(.+?)</option>',
                        sub_html[0][len(pat):]):
        code = m[0].strip().upper()
        state = m[1].strip().upper()
        ret = gs.look_up(state, 2)
        if ret is not None:
            # state_list.append({'state': ret[0]['province_e'], 'state_code': ret[0]['state_code']})
            state_list.append({
                'state': ret['name_e'],
                'state_code': ret['code']
            })
        else:
            # The option text was not recognised as a state name; fall back
            # to matching the option value against the known province codes.
            for key in gs.province_map['data']:
                # "state" is reused here: first the province record, then
                # its English name.
                state = gs.province_map['data'][key]
                if state['code'] == code:
                    state = state['name_e']
                    state_list.append({'state': state, 'state_code': code})
                    break
def fetch_stores(data): """ 获得商店信息 :param data: """ url = data['url'] try: info = json.loads(cm.get_data(url, {'tskay': data['key_term']})) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] raw_list = info['shops'] store_list = [] for s in raw_list: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.city_e] = s['city'].strip().upper() entry[cm.country_e] = data['country_e'].strip().upper() entry[cm.name_e] = s['name'].strip() addr = s['address'] entry[cm.addr_e] = addr terms = addr.split(',') if len(terms) > 1 and entry[cm.city_e] in terms[-1].strip().upper(): country = entry['country_e'] tmp = gs.look_up(country, 1) if tmp is not None: country = tmp['name_e'] if country == 'JAPAN': # 日本邮编 m = re.search(ur'\d{3,}[ -\.]+?\d{3,}', terms[-1]) if m is not None: entry[cm.zip_code] = m.group(0) else: m = re.search(ur'\d{4,}', terms[-1]) if m is not None: entry[cm.zip_code] = m.group(0) entry[cm.tel] = s['tel'] entry[cm.fax] = s['fax'] entry[cm.email] = s['email'] gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) # db.insert_record(entry, 'stores') return store_list
def fetch_countries(data, logger):
    """List the countries offered by the ``#country`` drop-down.

    :param data: dict with 'new_home_url'; it is copied, never modified.
    :param logger: not used by this function.
    :return: list of copies of ``data`` extended with 'country_id' (the
        option value) and 'country_e' (canonical name when known, otherwise
        the upper-cased option text).
    """
    page = pq(url=data['new_home_url'])
    country_list = []
    for option in page('#country option[value!=""]'):
        d = data.copy()
        d['country_id'] = option.attrib['value']
        name = option.text.strip().upper()
        hit = gs.look_up(name, 1)
        d['country_e'] = name if hit is None else hit['name_e']
        country_list.append(d)
    return country_list
def fetch_cities(data):
    """Expand ``data`` into one record per known city of its country.

    :param data: dict with 'country'; it is copied, never modified.
    :return: list of copies of ``data`` with 'country' (canonical name),
        'city', 'city_lat' and 'city_lng'; [] when the country is unknown or
        has no cities in the map from ``gen_city_map()``.
    """
    hit = gs.look_up(data['country'], 1)
    if hit is None:
        return []

    country = hit['name_e']
    city_map = gen_city_map()
    if country not in city_map:
        return []

    results = []
    for city in city_map[country]:
        d = data.copy()
        d.update({
            'country': country,
            'city': city,
            'city_lat': city_map[country][city]['lat'],
            'city_lng': city_map[country][city]['lng'],
        })
        results.append(d)
    return results
def fetch_cities(data, logger):
    """Expand ``data`` into one record per known city of its country.

    :param data: dict with 'country_code' and 'city_map' (country name ->
        city -> {'lat', 'lng'}); it is copied, never modified.
    :param logger: not used by this function.
    :return: tuple of copies of ``data`` with 'country' (canonical name),
        'city', 'city_lat' and 'city_lng'; an empty tuple when the country
        is unknown or has no cities in the map.
    """
    hit = gs.look_up(data['country_code'].upper(), 1)
    if hit is None:
        return ()

    country = hit['name_e']
    city_map = data['city_map']
    if country not in city_map:
        return ()

    results = []
    for city in city_map[country]:
        d = data.copy()
        d.update({
            'country': country,
            'city': city,
            'city_lat': city_map[country][city]['lat'],
            'city_lng': city_map[country][city]['lng'],
        })
        results.append(d)
    return tuple(results)
def fetch_countries(data): url = data['home_url'] try: html = cm.get_data(url, {'brand': 'oasis', 'countryISO': 'GB'}) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] country_list = [] for m in re.findall(ur'<option value="([A-Z]{2})">(.+?)</option>', html): d = data.copy() d['country_code'] = m[0] country = m[1].strip().upper() ret = gs.look_up(country, 1) if ret is not None: country = ret['name_e'] d['country_e'] = country country_list.append(d)
def fetch_countries(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] start = html.find('<select name="country" id="inp-country"') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select>') if end == 0: return [] country_list = [] for m in re.findall(ur'<option value="([A-Z]{2})">(.*?)</option>', sub): d = data.copy() d['country_code'] = m[0] d[cm.country_c] = m[1].strip() for key in [cm.country_e, cm.continent_e, cm.continent_c]: d[key] = '' ret = gs.look_up(d['country_code'], 1) if ret is not None: d[cm.country_e] = ret['name_e'] d[cm.country_c] = ret['name_c'] d[cm.continent_c] = ret['continent']['name_c'] d[cm.continent_e] = ret['continent']['name_e'] country_list.append(d)
def fetch_states(data): print '(%s/%d) Found country: %s' % (data['brandname_e'], data['brand_id'], data['country_e']) url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] # 处理重定向 m = re.search('<h2>Object moved to <a href="(.+?)">', html) if m is not None: data['url'] = data['host'] + m.group(1) return fetch_countries(data) m = re.search('<span class="state">Choose a state/provence</span>', html) if m is None: return [] sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>') if end == 0: return [] state_list = [] for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub): province_e = cm.html2plain(m[1]).strip().upper() if data['country_e'] == 'CHINA': # 去掉省中间的空格 province_e = province_e.replace(' ', '') ret = gs.look_up(province_e, 2) if ret is not None: province_e = ret['name_e'] d = data.copy() d['province_e'] = province_e d['url'] = data['host'] + m[0] state_list.append(d) return state_list
def fetch_cities(data, logger):
    """List the cities offered for one country.

    Posts ``data['country_id']`` to ``data['post_city']`` and reads the
    options of the ``#cities`` drop-down, skipping the one whose value is
    "0".

    :param data: dict with 'post_city' and 'country_id'; it is copied, never
        modified.
    :param logger: object with an ``error`` method.
    :return: list of copies of ``data`` extended with 'city_e'; an empty
        tuple when the request or the HTML parsing fails.
    """
    url = data['post_city']
    try:
        page = pq(cm.post_data(url, {'country': data['country_id']}))
    except Exception:
        logger.error(u'Error occured for country:{0}'.format(data['country_id']))
        return ()

    city_list = []
    for option in page('#cities option[value!="0"]'):
        d = data.copy()
        name = cm.html2plain(option.text).strip().upper()
        hit = gs.look_up(name, 3)
        # Prefer the canonical name when the city is known.
        d['city_e'] = name if hit is None else hit['name_e']
        city_list.append(d)
    return city_list
def fetch_stores(data): """ 获得门店信息 :param data: :return: """ url = data['url'] try: html = common.get_data(data['url']) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': common.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } common.dump(dump_data) return [] # 第二个<ul>...</ul> start = 0 for i in xrange(2): start = html.find('<ul>', start) if start == -1: return [] start += len('<ul>') end = html.find('</ul>', start) html = html[start:end] store_list = [] for m in re.findall(ur'<li>(.+?)</li>', html, re.S): entry = common.init_store_entry(brand_id, brandname_e, brandname_c) entry[common.store_type] = 'FASHION' m1 = re.findall(ur'<h2>(.+?)</h2>', m) if len(m1) > 0: entry[common.name_e] = common.reformat_addr(m1[0]) # Google Maps网址 m1 = re.findall(ur'href="(https://maps.google.com/maps[^\s]+?)"', m) if len(m1) > 0: entry[common.url] = m1[0] addr = common.reformat_addr('\n\r'.join( [m1 for m1 in re.findall(ur'<p>(.+?)</p>', m)])) entry[common.addr_e] = addr terms = addr.split(',') # 是否所有的geosensing都未命中? 
hit_flag = False # 最后一项是否为国家 country = '' ret = gs.look_up(terms[-1], 1) if ret is not None: entry[common.country_e] = ret['name_e'] country = ret['name_e'] terms = terms[:-1] hit_flag = True # 查找州和城市 m = re.match(ur'.*(\d{5,})', terms[-1]) zip_cdt = '' if m is not None: zip_cdt = m.group(1) tmp = re.sub(ur'\d{5,}', '', terms[-1]).strip().upper() ret = gs.look_up(terms[-1], 2) if ret is not None: entry[common.province_e] = ret['name_e'] entry[common.zip_code] = zip_cdt terms = terms[:-1] hit_flag = True ret = gs.look_up(terms[-1], 3) if ret is not None: entry[common.city_e] = ret['name_e'] entry[common.zip_code] = zip_cdt hit_flag = True if not hit_flag: # 所有都未命中,输出: common.write_log('Failed in geosensing: %s' % addr) gs.field_sense(entry) print '%s Found store: %s, %s (%s, %s)' % ( brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.country_e], entry[common.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry)
continue sub1 = cm.extract_closure(sub[start:], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.store_type] = store_type m1 = re.search(ur'<p class="store-item-name">(.+?)</p>', sub1, re.S) if m1 is not None: entry[cm.name_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<p class="store-item-adress">(.+?)</p>', sub1, re.S) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) entry[cm.tel] = cm.extract_tel(sub1) ret = gs.look_up(data['country_code'], 1) if ret is not None: entry[cm.country_e] = ret['name_e'] ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] if ret[2] is not None: entry[cm.city_e] = ret[2] else: entry[cm.city_e] = data['city'].strip().upper() if entry[cm.name_e] in latlng_map: tmp = latlng_map[entry[cm.name_e]] entry[cm.lat] = tmp['lat'] entry[cm.lng] = tmp['lng']
def fetch(level=1, data=None, host='localhost', port=3306, user='******', passwd='123456'): tot = 0 start = 0 store_list = [] data = {'q': '*:*', 'pt': '0,0', 'd': 100000, 'start': 0, 'rows': 100} # data = {'q': '*:*', 'pt': '36.778261,-119.417932', 'd': 50, 'start': 0, 'rows': 100} db = cm.StoresDb() db.connect_db(host=host, port=port, user=user, passwd=passwd, db='brand_stores') db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id)) while True: cm.dump('Fetching from %d' % start, 'triumph_log.txt') try: data['start'] = start html = cm.get_data(url, data) raw_list = json.loads(html) if tot == 0: tot = raw_list['response']['numFound'] cm.dump('Found: %d' % tot, 'triumph_log.txt') raw_list = raw_list['response']['docs'] except Exception: cm.dump('Error occured while fetching from %d' % data['start'], 'triumph_log.txt') dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] idx = 0 if len(raw_list) < data['rows'] and start + len(raw_list) < tot: cm.dump('Cooling down...', 'triumph_log.txt') time.sleep(5) continue for v in raw_list: entry = cm.init_store_entry(brand_id, brandname_e, brandname_c) cm.update_entry( entry, { cm.store_type: v['class'], cm.zip_code: v['zip'], cm.tel: v['phone'], cm.fax: v['fax'], cm.url: v['web'], cm.email: v['email'], cm.hours: v['opening_hours'] }) entry[cm.name_e] = cm.reformat_addr(v['name']) entry[cm.city_e], tmp = cm.extract_city(v['city']) if not re.search(ur'\d', entry[cm.zip_code]) and tmp != '': entry[cm.zip_code] = tmp if v['location'] != '': terms = v['location'].split(',') cm.update_entry(entry, { cm.lat: string.atof(terms[0]), cm.lng: string.atof(terms[1]) }) addr = v['address'] if v['address2'] != '': addr += ', ' + v['address2'] entry[cm.addr_e] = cm.reformat_addr(addr) ret = gs.look_up(v['country'], 1) if ret is not None: entry[cm.country_e] = ret['name_e'] else: cm.dump('Error in looking up country %s' % v['country'], 
'triumph_log.txt') gs.field_sense(entry) cm.dump( '(%s / %d) Found store at %d: %s, %s (%s, %s, %s)' % (brandname_e, brand_id, start + idx, entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]), 'triumph_log.txt') store_list.append(entry) db.insert_record(entry, 'stores') idx += 1 if tot - start <= len(raw_list): break else: start += len(raw_list)
def fetch(level=1, data=None, user='******', passwd=''):
    """Download every store of the brand and save it to the 'stores' table.

    The existing rows of the brand are deleted first.  The first batch comes
    from ``url_init``; while a batch carries a truthy 'more' marker the next
    one is requested from ``url_more`` with an 'offset' parameter.

    :param level: not used by this function.
    :param data: ignored -- it is replaced by the fixed query parameters.
    :param user: database user.
    :param passwd: database password.
    :return: list of the entries inserted into the 'stores' table; [] when
        the first request fails.
    """
    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

    # Bounding box and filter flags sent with every request.
    data = {
        's': -89,
        'w': -179,
        'n': 89,
        'e': 179,
        'chinese': 0,
        'repair': 1,
        'store': 1
    }
    try:
        html = common.get_data(url_init, data)
    except Exception:
        print 'Error occured in getting the list of countries: %s' % url_init
        dump_data = {
            'level': 1,
            'time': common.format_time(),
            'data': {
                'data': url_init
            },
            'brand_id': brand_id
        }
        common.dump(dump_data)
        # NOTE(review): this path returns without calling
        # db.disconnect_db().
        return []

    store_list = []
    store_map = json.loads(html)
    tot = 0
    while True:
        # Obtain the {'uid': entry} dictionary.
        tmp = store_map['lists']
        # Is there a 'more' marker?
        flag = False
        # 'lists' is handled both as a sequence and as a mapping; the test
        # for a has_key attribute tells the two apart.
        if 'has_key' not in dir(tmp):
            raw_stores = {}
            for item in tmp:
                if 'more' in item:
                    flag = item['more']
                else:
                    raw_stores[item['nid']] = item
        else:
            raw_stores = tmp
            for k in tmp:
                if 'more' in tmp[k]:
                    flag = tmp[k]['more']
                    break

        # Analyse raw_stores.
        for k in raw_stores:
            s = raw_stores[k]
            if 'more' in s:
                flag = s['more']
            else:
                entry = common.init_store_entry(brand_id, brandname_e,
                                                brandname_c)
                # Values found by the geo lookup fill both language fields;
                # otherwise the raw text goes to the Chinese or the English
                # field depending on common.is_cjk().
                if s['country'] is not None:
                    country_c = s['country'].strip().upper()
                    ret = gs.look_up(country_c, 1)
                    if ret is not None:
                        entry[common.country_e] = ret['name_e']
                        entry[common.country_c] = ret['name_c']
                    else:
                        if common.is_cjk(country_c):
                            entry[common.country_c] = country_c
                        else:
                            entry[common.country_e] = country_c
                if s['address'] is not None:
                    addr = common.reformat_addr(s['address'])
                    if common.is_cjk(addr):
                        entry[common.addr_c] = addr
                    else:
                        entry[common.addr_e] = addr
                city = s['city']
                if city is not None:
                    city = city.strip().upper()
                    ret = gs.look_up(city, 3)
                    if ret is not None:
                        entry[common.city_c] = ret['name_c']
                        entry[common.city_e] = ret['name_e']
                    else:
                        if common.is_cjk(city):
                            entry[common.city_c] = city
                        else:
                            entry[common.city_e] = city
                    entry[common.city_e] = common.extract_city(
                        entry[common.city_e])[0]
                if s['email'] is not None:
                    entry[common.email] = s['email']
                if s['fax'] is not None:
                    entry[common.fax] = s['fax']
                if s['latitude'] is not None:
                    entry[common.lat] = string.atof(s['latitude'])
                if s['longitude'] is not None:
                    entry[common.lng] = string.atof(s['longitude'])
                if s['phone'] is not None:
                    entry[common.tel] = s['phone']
                if s['postal_code'] is not None:
                    entry[common.zip_code] = s['postal_code']
                if s['title'] is not None:
                    name = s['title']
                    if common.is_cjk(name):
                        entry[common.name_c] = name
                    else:
                        entry[common.name_e] = name
                if s['operating_hours'] is not None:
                    entry[common.hours] = s['operating_hours']
                if s['url'] is not None:
                    # NOTE(review): host is not defined in this function;
                    # presumably a module-level global -- verify.
                    entry[common.url] = host + s['url']
                gs.field_sense(entry)
                print '%s: Found store: %s, %s (%s, %s)' % (
                    brandname_e, entry[common.name_e], entry[common.addr_e],
                    entry[common.country_e], entry[common.continent_e])
                db.insert_record(entry, 'stores')
                store_list.append(entry)

        if flag:
            # The 'more' marker occupies one element of 'lists' and is not
            # counted as a store.
            tot += len(store_map['lists']) - 1
            data['offset'] = tot
            # NOTE(review): unlike the first request, this fetch is not
            # wrapped in try/except.
            store_map = json.loads(common.get_data(url_more, data))
            continue
        else:
            tot += len(store_map['lists'])
            break

    print 'Found a total of %d stores.' % tot
    db.disconnect_db()
    return store_list
try: body = cm.get_data(url, param) except Exception, e: cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name) return [] results = [] for c in json.loads(body)['geoEntityLocaleList']: d = data.copy() d['country_id'] = string.atoi(c['geoEntity']['id']) d['country'] = cm.html2plain(c['geoEntity']['name']).strip() results.append(d) for item in results: if gs.look_up(item['country'].upper(), 1) is None: print 'Cannot look up %s' % item['country'] return results def fetch_states(data): url = data['host'] + data['geo_url'] param = {'lang': 'EN_US', 'geo_id': data['country_id']} try: body = cm.get_data(url, param) except Exception, e: cm.dump('Error in fetching states: %s, %s' % (url, param), log_name) return [] results = []
cm.dump('Error in fetching stores: %s' % url, log_name) return () sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0] store_list = [] for city_sub in re.findall(ur'<tr>(.+?)</tr>', sub, re.S): m = re.search(ur"<td[^<>]+class='shopLocation'\s*>([^<>]+)</td>", city_sub) city_c = m.group(1).strip() city_e = '' if city_c == u'吉隆坡': city_e = 'KUALA LUMPUR' elif city_c == u'槟城': city_e = 'PENANG' else: ret = gs.look_up(city_c, 3) if ret: city_e = ret['name_e'] city_c = ret['name_c'] m = re.search(ur"<td class='storeName'>(.+?)</td>", city_sub, re.S) if not m: continue for name in (tmp.strip() for tmp in cm.reformat_addr(m.group(1)).split(',')): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country_code'] entry[cm.city_e], entry[cm.city_c] = city_e, city_c entry[cm.name_e] = name
else: body = data['body'] store_list = [] city = data['city'] if city == '': m = re.search(ur'<span id="m_sthead"\s*>(.+?)</span>', body) if m is not None: city = cm.reformat_addr(m.group(1)) city = city.replace(u'市', u'').strip() for m in re.finditer(ur'<span id="m_stname"[^<>]*>(.+?)</span>', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.province_c] = data['province'] ret = gs.look_up(data['province'], 2) if ret is not None: entry[cm.province_e] = ret['name_e'] entry[cm.city_c] = city ret = gs.look_up(city, 3) if ret is not None: entry[cm.city_e] = ret['name_e'] entry[cm.name_e] = cm.reformat_addr(m.group(1)) m1 = re.search(ur'<span id="m_stlist"[^<>]*>(.+?)</span>', body[m.end():]) if m1 is not None: addr_list = cm.reformat_addr(m1.group(1)).split(',') tel = cm.extract_tel(addr_list[-1]).strip() if tel != '':
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<item id="\d+">', body): sub = cm.extract_closure(body[m.start():], ur'<item\b', ur'</item>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'<country>([^<>]+)</country>', sub) if m1 is not None: tmp = m1.group(1).split('/') for v in tmp: ret = gs.look_up(v.strip().upper(), 1) if ret is not None: entry[cm.country_e] = ret['name_e'] break m1 = re.search(ur'<city>([^<>]+)</city>', sub) if m1 is not None: val = cm.reformat_addr(m1.group(1)) if entry[cm.country_e] == 'UNITED STATES': tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(val).strip(',')) if len(tmp_list) == 2: if re.search('[A-Z]{2}', tmp_list[1]): entry[cm.province_e] = tmp_list[1] entry[cm.city_e] = cm.extract_city(m1.group(1))[0] m1 = re.search(ur'<brands>([^<>]+)</brands>', sub) if m1 is not None: tmp = m1.group(1).split('/') brand_list = [] for v in tmp: if v.strip() != '': brand_list.append(v) entry[cm.store_type] = ', '.join(brand_map[key] for key in brand_list) m1 = re.search(ur'<name>([^<>]+)</name>', sub) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() m1 = re.search(ur'<address>([^<>]+)</address>', sub) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<tel>([^<>]+)</tel>', sub) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'sll=(-?\d+\.\d+),(-?\d+\.\d+)', sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump( '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], 
entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_store_details(data): url = data['url'] try: body = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) ret = gs.look_up(data['country'], 1) if ret is not None: entry[cm.country_e] = ret['name_e'] m = re.search(ur'<span class="type">Address</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: addr = cm.reformat_addr(m.group(1)) country, province, city = gs.addr_sense(addr) if country is not None and entry[cm.country_e] == '': entry[cm.country_e] = country if province is not None: entry[cm.province_e] = province if city is not None: entry[cm.city_e] = city entry[cm.addr_e] = addr m = re.search(ur'<span class="type">Phone</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: entry[cm.tel] = m.group(1) m = re.search(ur'<span class="type">Opening hours</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: entry[cm.hours] = cm.reformat_addr(m.group(1)) m = re.search(ur'<span class="type">You can find</span>\s*<p>(.+?)</p>', body, re.S) if m is not None: entry[cm.store_type] = cm.reformat_addr(m.group(1)) m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', body, re.S) entry[cm.lat] = string.atof(m.group(1)) entry[cm.lng] = string.atof(m.group(2)) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) db.insert_record(entry, 'stores') return [entry]
def fetch(level=1, data=None, user='******', passwd=''): # Walk from the root node, where level == 1. if data is None: data = { 'url': 'http://cms.destinationkors.com/store/get', 'brand_id': 10259, 'brandname_e': u'Michael Kors', 'brandname_c': u'迈克.柯尔' } type_desc = ['Collection Boutique', 'Lifestyle', 'Outlet'] global db db = cm.StoresDb() db.connect_db(user=user, passwd=passwd) db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', data['brand_id'])) store_list = [] url = data['url'] try: html = cm.get_data(url).decode('unicode_escape') start = html.find('[') if start == -1: return [] js = json.loads(html[start:]) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] for s in js: entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.store_type] = type_desc[string.atoi(s['store_type']) - 1] name = s['name'].strip() if s['name2'].strip() != '': name += ', ' + s['name2'].strip() entry[cm.name_e] = name addr = [] for i in xrange(3): tmp = s['address%d' % (i + 1)].strip() if tmp != '': addr.append(tmp) entry[cm.addr_e] = ', '.join(addr) entry[cm.city_e] = cm.extract_city(s['city'])[0] country = s['country'] ret = gs.look_up(country, 1) if ret is not None: country = ret['name_e'] entry[cm.country_e] = country state = s['state'].strip().upper() if country == 'UNITED STATES' and state != '': ret = gs.look_up(state, 2) if ret is not None: entry[cm.province_e] = ret['name_e'] else: entry[cm.province_e] = state entry[cm.zip_code] = s['zip'] entry[cm.tel] = s['phone'] entry[cm.hours] = s['hours'] entry[cm.lat] = string.atof(s['latitude']) entry[cm.lng] = string.atof(s['longitude']) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) 
db.insert_record(entry, 'stores') db.disconnect_db() return store_list
param = {'country': data['country_tag'], 'city': data['city_tag'], 'adutl': ' 01', 'kids': ' 02', 'undercolor': ' 06', 'togetmap': 'mapdata'} try: body = cm.get_data(url, param) except Exception, e: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return () body = cm.extract_closure(body, ur'\(', ur'\)')[0][1:-1] sub = json.loads(body)['data']['xml_dt'] store_list = [] for m in re.findall(ur'<marker[^<>]+/\s*>', sub): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country'] entry[cm.city_e] = cm.extract_city(data['city'])[0] ret = gs.look_up(entry[cm.country_e], 1) if ret and ret['name_e'] == 'UNITED STATES': tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(data['city']).strip(',')) if len(tmp_list) == 2: if re.search('[A-Z]{2}', tmp_list[1]): entry[cm.province_e] = tmp_list[1] # m1 = re.search(ur'name\s*=\s*"([^"]+)"', m) # entry[cm.name_e] = m1.group(1) if m1 else '' m1 = re.search(ur'address\s*=\s*"([^"]+)"', m) if m1: addr = re.sub(ur'\.textmap\{.*\}', '', cm.reformat_addr(m1.group(1))) addr_list = [tmp.strip() for tmp in addr.split(',')] tel = cm.extract_tel(addr_list[-1]) if tel != '':
for m1 in re.findall( ur'<span itemprop="addressLocality">(.*?)</span>', m): if len(m1.strip()) > 0: city = cm.extract_city(m1)[0] break for m1 in re.findall( ur'<span itemprop="addressCountry">(.*?)</span>', m): if len(m1.strip()) > 0: country = m1 break entry[cm.zip_code] = zip_code # 没有上述标签的情况 if street_addr == '': tmp = cm.reformat_addr(m) terms = tmp.split(',') ret = gs.look_up(terms[-1], 1) if ret is not None: # t2 = cm.geo_translate(terms[-1]) # if len(t2) != 0: # 这是一个国家 # 把最后的国家项分离出来 street_addr = ', '.join(terms[:-1]) entry[cm.addr_e] = cm.reformat_addr(street_addr) entry[cm.country_c] = ret['name_c'] entry[cm.country_e] = ret['name_e'] entry[cm.continent_c] = ret['continent']['name_c'] entry[cm.continent_e] = ret['continent']['name_e'] else: if cm.is_cjk(tmp): entry[cm.addr_c] = tmp else:
def get_frag_stores(data): try: html = common.get_data(data['url'], { 'country': data['country'], 'city_postal': '', 'page': data['page'] }) except Exception: print 'Error occured: %s' % url_fragrance dump_data = { 'level': 1, 'time': common.format_time(), 'data': { 'url': url_fragrance }, 'brand_id': brand_id } common.dump(dump_data) return [], False print 'PARSING PAGE: %d' % data['page'] start = html.find('<section id="content" class="content">') if start == -1: return [], False html, start, end = common.extract_closure(html[start:], ur'<section\b', ur'</section>') if end == 0: return [], False # 找到总页面数量 tot_page = 0 start = html.find('<div class="pagination">') if start != -1: pagination, start, end = common.extract_closure( html[start:], ur'<div\b', ur'</div>') m = re.findall(ur'<a href=".*?" class="page">(\d+)</a>', pagination) if len(m) > 0: tot_page = string.atoi(m[-1]) # 开始寻找门店 store_list = [] for m in re.findall(ur'<li>(.*?)</li>', html, re.S): entry = common.init_store_entry(brand_id, brandname_e, brandname_c) entry[common.store_type] = 'FRAGRANCE' m1 = re.findall(ur'<h2>(.+?)</h2>', m) if len(m1) > 0: entry[common.name_e] = common.html2plain(m1[0].strip()) m1 = re.findall(ur'href="(.+?)"', m) if len(m1) > 0: entry[common.url] = m1[0] addr = common.reformat_addr(','.join(re.findall(ur'<p>(.+?)</p>', m))) entry[common.addr_e] = addr terms = addr.split(', ') ret = gs.look_up(terms[-1], 1) if ret is not None: entry[common.country_e] = ret['name_e'] if len(terms) >= 2: m1 = re.match(ur'.*?(\d+)\s+(.*)', terms[-2]) if m1 is not None: ret = gs.look_up(m1.group(2).strip().upper(), 3) if ret is not None: entry[common.city_e] = ret['name_e'] else: if len(re.findall('(\S+)', m1.group(2).strip().upper())) == 1 and \ len(re.findall('(\d+)', m1.group(2).strip().upper())) == 0: entry[common.city_e] = m1.group(2).strip().upper() entry[common.zip_code] = m1.group(1).strip() gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % ( brandname_e, brand_id, 
entry[common.name_e], entry[common.addr_e], entry[common.country_e], entry[common.continent_e]) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): url = data['url'] try: html, cookie_map = cm.get_data_cookie(url) except Exception: print 'Error occured in getting country list: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] print 'SLEEPING>>>>' time.sleep(5) m = re.search('http://www.ninewest.com/on/demandware.store/Sites-ninewest-Site/default/Stores-Find/C\d{10}', html) if m is None: return [] url = m.group(0) cookie_map_new = {} for key in cookie_map: if 'dwpersonalization_' in key or key == 'sr_token': continue cookie_map_new[key] = cookie_map[key] cookie_map_new['invited_visitor_22225'] = '1' cookie_map = cookie_map_new try: html = cm.post_data(url, {'dwfrm_storelocator_startaddress': 'kingman', 'dwfrm_storelocator_maxDistance': 30.00, 'dwfrm_storelocator_outlet': 'true', 'dwfrm_storelocator_retail': 'true', 'dwfrm_storelocator_optical': 'true', 'dwfrm_storelocator_eyewear': 'true', 'dwfrm_storelocator_apparel': 'true', 'dwfrm_storelocator_attire': 'true', 'dwfrm_storelocator_department': 'true', 'dwfrm_storelocator_IsMensFootwear': 'true', 'dwfrm_storelocator_IsRRR': 'true', 'dwfrm_storelocator_IsRRNY': 'true', 'dwfrm_storelocator_IsRRS': 'true', 'dwfrm_storelocator_wholesale': 'true', 'dwfrm_storelocator_bba': 'true', 'dwfrm_storelocator_ba': 'true', 'dwfrm_storelocator_search.x': 0, 'dwfrm_storelocator_search.y': 0, 'dwfrm_storelocator_countryCode': 'US', 'dwfrm_storelocator_postalCode': '67068', 'dwfrm_storelocator_distanceUnit': 'mi', 'dwfrm_storelocator_long': -98.117208, 'dwfrm_storelocator_lat': 37.647131, }, cookie=cookie_map) except Exception: print 'Error occured in getting country list: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m1 in re.finditer(ur'<div class="storeColumnOne">', html): sub, start, end = cm.extract_closure(html[m1.start():], ur'<div\b', 
ur'</div>') if end == 0: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m2 = re.search(ur'<div class="storename">([^<>]+)</div>', sub) if m2 is not None: entry[cm.name_e] = m2.group(1).strip() addr_list = [m2 for m2 in re.findall(ur'<div class="adddressline">([^<>]+)</div>', sub)] entry[cm.addr_e] = ', '.join(addr_list) m2 = re.search(ur'<div class="citystatezip">([^<>]+)</div>', sub) if m2 is not None: tmp = cm.reformat_addr(m2.group(1)) terms = re.split('[, ]+', tmp) if len(terms) < 3: entry[cm.addr_e] = tmp else: ret = gs.look_up(terms[0], 3) if ret is not None: entry[cm.city_e] = ret['name_e'] else: entry[cm.city_e] = terms[0].strip().upper() ret = gs.look_up(terms[1], 2) if ret is not None: entry[cm.province_e] = ret['name_e'] else: entry[cm.province_e] = terms[0].strip().upper() if re.match('\s*\d{5,}\s*', terms[2]) is not None: entry[cm.zip_code] = terms[2].strip() m2 = re.search(ur'<div class="storephone">([^<>]+)</div>', sub) if m2 is not None: entry[cm.tel] = m2.group(1) cm.update_entry(entry, {'country_e': 'UNITED STATES', 'continent_e': 'NORTH AMERICA'}) gs.field_sense(entry) print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')