def get_detailed_store(html, store_cat):
    """Parse every <li> store block in *html*, insert each store into the DB.

    Relies on module-level globals: brand_id, brandname_e, brandname_c, data,
    common, gs, db -- TODO confirm these are defined at module scope.

    :param html: HTML fragment containing one <li>...</li> closure per store.
    :param store_cat: category label prepended to each store's type list.
    :return: list of inserted store entry dicts.
    """
    store_list = []
    start = 0
    while True:
        # Pull the next <li>...</li> closure; end == 0 means none left.
        sub_html, start, end = common.extract_closure(html, ur"<li\b", ur"</li>")
        if end == 0:
            break
        # Advance past the snippet for this single store.
        html = html[end:]
        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        m = re.findall(ur'<div class="store-title -h3a">(.+?)</div>', sub_html)
        if len(m) > 0:
            entry[common.name_e] = common.reformat_addr(m[0])
        m = re.findall(ur'<div class="store-address">(.+?)</div>', sub_html, re.S)
        if len(m) > 0:
            addr = common.reformat_addr(m[0])
            # Is the last address line actually a telephone number?
            terms = addr.split(", ")
            tel = common.extract_tel(terms[-1])
            if tel != "":
                addr = ", ".join(terms[:-1])
                entry[common.tel] = tel
            entry[common.addr_e] = addr
        # Determine the store type from the service list, if present.
        # store_type = [store_cat]
        type_html, type_start, type_end = common.extract_closure(sub_html, ur'<ul class="service-list">', ur"</ul>")
        if type_end != 0:
            store_type = [m for m in re.findall(ur'<li class="service-item">(.+?)</li>', type_html)]
            store_type.insert(0, store_cat)
            entry[common.store_type] = ", ".join(store_type)
        else:
            entry[common.store_type] = store_cat
        # Latitude / longitude from the data-* attributes.
        m = re.findall(ur'data-latitude="(-?\d+\.\d+)"', sub_html)
        if len(m) > 0:
            entry[common.lat] = string.atof(m[0])
        m = re.findall(ur'data-longitude="(-?\d+\.\d+)"', sub_html)
        if len(m) > 0:
            entry[common.lng] = string.atof(m[0])
        # City/country come from the module-level `data` dict, not the page.
        entry[common.city_e] = common.extract_city(data[common.city_e])[0]
        entry[common.country_e] = common.reformat_addr(data[common.country_e]).strip().upper()
        gs.field_sense(entry)
        print "%s: Found store: %s, %s (%s, %s, %s)" % (
            brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.city_e],
            entry[common.country_e], entry[common.continent_e], )
        db.insert_record(entry, "stores")
        store_list.append(entry)
    return store_list
def fetch_cities(data): url = data['home_url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching cities: %s' % url, 'canali_log.txt') dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = body.find(u'<nav class="countrySelector">') if start == -1: cm.dump('Error occured in fetching country list: %s' % url, 'canali_log.txt') body = cm.extract_closure(body[start:], ur'<nav\b', ur'</nav>')[0] results = [] for m in re.finditer(ur'<li><a href=".+?">(.+?)</a>', body): country = m.group(1).strip().upper() sub = cm.extract_closure(body[m.end():], ur'<ul\b', ur'</ul>')[0] for m1 in re.findall(ur'<li><a class=".+?" href="(.+?)">(.+?)</a></li>', sub): d = data.copy() d['country'] = country d['url'] = data['host'] + m1[0] d['city'] = m1[1].strip().upper() results.append(d)
def fetch_stores(data): param = {'action': 'getStoresFromAjax', 'country': data['country_code'], 'region': data['city'], 'collection': ''} url = data['url'] try: body = cm.post_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return [] store_list = [] for m1 in re.finditer(ur'<div class="shop-type-container">', body): sub = cm.extract_closure(body[m1.start():], ur'<div\b', ur'</div>')[0] store_class = '' m2 = re.search(ur'<div class="shop-type-title">(.+?)</div>', sub, re.S) if m2 is not None: store_class = cm.reformat_addr(m2.group(1)) for m2 in re.finditer(ur'<div class="shop"', sub): store_sub = cm.extract_closure(sub[m2.start():], ur'<div\b', ur'</div>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.store_class] = store_class entry[cm.country_e] = data['country_code'] entry[cm.city_e] = cm.extract_city(data['city'])[0] m3 = re.search(ur'loadStore\((\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\)', store_sub) if m3 is not None: data['store_id'] = string.atoi(m3.group(1)) entry[cm.lat] = string.atof(m3.group(2)) entry[cm.lng] = string.atof(m3.group(3)) entry[cm.store_type] = ', '.join(get_detail(data)) m3 = re.search(ur'<div class="shop-name shop-details shop-main-name">([^<>]+)</div>', store_sub) if m3 is not None: entry[cm.name_e] = m3.group(1).strip() addr_list = [] m3 = re.search(ur'<div class="shop-street shop-details">([^<>]+)</div>', store_sub) if m3 is not None: addr_list.append(cm.reformat_addr(m3.group(1))) m3 = re.search(ur'<div class="shop-city shop-details">([^<>]+)</div>', store_sub) if m3 is not None: tmp = cm.reformat_addr(m3.group(1)) m3 = re.search(ur'(\d{4,})', tmp) if m3 is not None: entry[cm.zip_code] = m3.group(1).strip() addr_list.append(tmp) entry[cm.addr_e] = ', '.join(addr_list) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] 
gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m1 in re.finditer(ur'<lignecountry\s+titre\s*=\s*"([^"]+)"', body): country = m1.group(1).strip().upper() if country == 'U.S.A.': country = 'US' sub_country = cm.extract_closure(body[m1.start():], ur'<lignecountry\b', ur'</lignecountry>')[0] for m2 in re.finditer(ur'<lignecity\s+titre\s*=\s*"([^"]+)"', sub_country): city = m2.group(1).strip().upper() sub_city = cm.extract_closure(sub_country[m2.start():], ur'<lignecity\b', ur'</lignecity>')[0] m3 = re.search(ur'<!\[CDATA\[(.+?)\]\]>', sub_city, re.S) if m3 is None: continue sub_city = m3.group(1) store_subs = re.split(ur'<\s*h2\s*>\s*LANVIN BOUTIQUE\s*<\s*/h2\s*>', sub_city) for s in store_subs: if s.strip() == '': continue m4 = re.search(ur'<p>(.+?)</p>', s, re.S) if m4 is None: continue entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = country entry[cm.city_e] = city s = m4.group(1) m4 = re.search(ur'(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.addr_e] = cm.reformat_addr(m4.group(1)) m4 = re.search(ur'Phone:(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.tel] = cm.reformat_addr(m4.group(1).strip()) m4 = re.search(ur'Boutique Hours:(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.hours] = cm.reformat_addr(m4.group(1).strip()) m4 = re.search(ur'Products available:(.+?)\n\s*\n', s, re.S) if m4 is not None: entry[cm.store_type] = m4.group(1).strip() m4 = re.search(ur'Email:\s*<a href="mailto:([^"]+)">', s) if m4 is not None: entry[cm.email] = m4.group(1).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], 
entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_store_details(data): url = data['host'] + data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching store details: %s' % url, log_name) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) start = body.find(ur'<h3>available in store</h3>') if start != -1: type_sub = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0] entry[cm.store_type] = ', '.join( cm.html2plain(tmp).strip() for tmp in re.findall(ur'<li[^<>]*>(.+?)</li>', type_sub, re.S)) start = body.find(ur"<div class='gmap_info_box'") if start == -1: cm.dump('Error in fetching store details: %s' % url, log_name) return [] body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] raw = json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['table'] entry[cm.name_e] = cm.html2plain(raw['name']) entry[cm.city_e] = data['city'].strip().upper() entry[cm.country_e] = data['country'].strip().upper() # entry[cm.store_type] = data['store_type'] entry[cm.addr_e] = cm.reformat_addr(raw['address']) m = re.search(re.compile(ur'phone:(.*?)fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() entry[cm.fax] = m.group(2).strip() else: m = re.search(re.compile(ur'phone:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search(re.compile(ur'fax:(.*?)', re.I | re.S), raw['phone']) if m is not None: entry[cm.fax] = m.group(1).strip() entry[cm.hours] = raw['hours'] if raw['lat'] is not None and raw['lat'] != '': entry[cm.lat] = string.atof(raw['lat']) if raw['lng'] is not None and raw['lng'] != '': entry[cm.lat] = string.atof(raw['lng']) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], 
entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') return [entry]
def parse_store(data, body=None):
    """Build an address -> lat/lng map from a Drupal getlocations settings blob.

    NOTE(review): as captured here the function stops after building
    *latlng_map* and implicitly returns None -- the rest of the parser
    appears to be missing; confirm against the original source.

    :param data: dict with 'url' (used only when *body* is not supplied).
    :param body: optional pre-fetched page body.
    """
    if body is None:
        url = data['url']
        try:
            body = cm.post_data(url)
        except Exception:
            cm.dump('Error in fetching stores: %s' % url, log_name)
            return []
    start = body.find(ur'jQuery.extend(Drupal.settings,')
    latlng_map = {}
    if start != -1:
        # Each latlons item is [lat, lng, ?, address]; key by normalized address.
        for item in json.loads(cm.extract_closure(body[start:], ur'\{', ur'\}')[0])['getlocations']['key_1']['latlons']:
            latlng_map[cm.reformat_addr(item[3])] = {'lat': string.atof(item[0]), 'lng': string.atof(item[1])}
def fetch_cities(data): sql = "SELECT CityUP FROM %s WHERE Country='%s' ORDER BY CityUP ASC" % (tableid, data['country_code']) url = (u'%s?sql=%s&key=%s' % (data['data_url'], sql, queryUrlTail)).replace(u' ', u'%20') try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching cities: %s' % url, log_name) return [] results = [] for c in set([tmp[0] for tmp in json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['rows']]): d = data.copy() d['city'] = c results.append(d)
def fetch_store_details(data): url = '%s/%d' % (data['url'], data['store_id']) try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching countries: %s' % url, log_name) return [] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m = re.search(ur'<h1 class="with-back-option">\s*([^<>]+)\s*[<>]', body) if m is not None: entry[cm.name_e] = m.group(1).strip() start = body.find(ur'<div class="store-details">') if start != -1: sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] addr = cm.extract_closure(sub, ur'<p\b', ur'</p>')[0] m = re.search(ur'<span class="locality">([^<>]+?)</span>', addr) if m is not None: entry[cm.city_e] = m.group(1).split(',')[0].strip().upper() m = re.search(ur'<span class="postal-code">([^<>]+?)</span>', addr) if m is not None: entry[cm.zip_code] = m.group(1).strip() m = re.search(ur'<span class="country-name">([^<>]+?)</span>', addr) if m is not None: entry[cm.country_e] = m.group(1).strip().upper() entry[cm.addr_e] = cm.reformat_addr(addr) start = body.find(ur'<div class="contact">') if start != -1: sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] m = re.search(ur'<span class="tel">(.+?)</span>', sub) if m is not None: entry[cm.tel] = m.group(1).strip() m = re.search(ur'<span class="fax">(.+?)</span>', sub) if m is not None: entry[cm.fax] = m.group(1).strip() m = re.search(ur'<a href="mailto:([^"]+)">Email</a>', sub) if m is not None: entry[cm.email] = m.group(1).strip() start = body.find(ur'<h3>Opening hours</h3>') if start != -1: tmp = [] sub = cm.extract_closure(body[start:], ur'<table>', ur'</table>')[0] for m in re.findall(ur'<t[hd][^<>]*>([^<>]+)</t[hd]>', sub): tmp.append(m) entry[cm.hours] = ' '.join(tmp)
def fetch_states(data): global national_added url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching states: %s' % url, log_name) return [] national_added = False m = re.search(ur'Choose a (state|region|province)', body) if m is None: d = data.copy() d['state'] = '' return [d] body = cm.extract_closure(body[m.start():], ur'<ul>', ur'</ul>')[0] results = [] for m in re.findall(ur'<a href="([^"]+)">([^<>]+)</a>', body): d = data.copy() d['url'] = data['host'] + m[0] d['state'] = cm.html2plain(m[1]).strip().upper() results.append(d)
def fetch_cities(data):
    """
    Walk the expanded city list in data['html'] and crawl each city's stores.

    :param data: dict carrying 'html' (navigation fragment) and 'country_e'.
    :return: flat list of store entries from fetch_stores() over every city.
    """
    html = data['html']
    store_list = []
    while True:
        m = re.search(ur'<li class="expanded"><a href=".*?">(.+?)</a><br\s*?/>', html)
        if m is None:
            break
        html = html[m.start():]
        sub, start, end = cm.extract_closure(html, ur'<li\b', '</li>')
        html = html[end:]
        d = data.copy()
        # Inner HTML of the <li>, minus the heading anchor and closing tag.
        d['html'] = sub[len(m.group(0)):-len('</li>')]
        terms = m.group(1).strip().upper().split(' ')
        # A trailing Chinese token is the Chinese city name.
        if len(terms) > 1 and cm.is_chinese(terms[-1]):
            d['city_c'] = terms[-1].strip()
            terms = terms[:-1]
        d['city_e'] = ' '.join(terms)
        if d['country_e'] == 'USA':
            # US entries look like "NY - NEW YORK": state code, then city.
            m1 = re.search(ur'([A-Z]{2})\s*-\s*(.+)', d['city_e'])
            if m1:
                d['city_e'] = m1.group(2).strip()
                d['province_e'] = m1.group(1).strip()
        print 'Processing %s' % d['city_e']
        store_list.extend(fetch_stores(d))
    return store_list
def fetch_color(cls, response, spider=None):
    """Return the list of colour names for the product page in *response*.

    cn pages expose colours in a select-color dropdown; other regions embed
    them in the `var productURLs` JSON literal, where only the currently
    selected option contributes.
    """
    sel = Selector(response)
    meta = response.meta
    region = meta['userdata']['region'] if 'userdata' in meta else meta['region']
    colors = []
    if region == 'cn':
        try:
            texts = sel.xpath('//select[@class="select-color"]/option//a[@href]/text()').extract()
            if texts:
                colors = [cls.reformat(t) for t in texts]
        except(TypeError, IndexError):
            pass
        return colors
    try:
        idx = response.body.find('var productURLs')
        mapping = json.loads(cm.extract_closure(response.body[idx:], '\{', '\}')[0].replace("'", '"'))
        for key in mapping:
            nodes = sel.xpath(str.format('//select/option[@value="{0}"]', key))
            if not nodes:
                continue
            node = nodes[0]
            # Only the currently selected colour counts.
            if node.xpath('@selected'):
                label = node.xpath('text()').extract()
                if label:
                    colors = [cls.reformat(label[0])]
    except ValueError:
        pass
    return colors
def fetch_countries(data): url = data['url'] try: body, cookie = cm.get_data_cookie(url) except Exception: cm.dump('Error in fetching countries: %s' % url, log_name) return [] m = re.search(ur'name="form_build_id" value="(.+?)"', body) if m is None: cm.dump('Error in fetching countries: %s' % url, log_name) return [] data['form_build_id'] = m.group(1) if cookie is None: data['cookie'] = '' else: data['cookie'] = cookie start = body.find(ur'<select id="edit-countries"') if start == -1: cm.dump('Error in fetching countries: %s' % url, log_name) return [] body = cm.extract_closure(body[start:], ur'<select\b', ur'</select>')[0] results = [] for m in re.findall(ur'<option.+?value="([A-Z]{3})".*?>(.+?)</option>', body): d = data.copy() d['country_code'] = m[0] d['country'] = m[1].strip() print 'Country: %s, %s' % (d['country_code'], d['country']) results.append(d)
def fetch_stores(data): body = data['body'] start = body.find(u'<ul class="storelist storelist_%s' % data['code']) if start == -1: cm.dump('Error in finding stores for %s' % data['code']) return [] body = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0] store_list = [] for m in re.findall(ur'<li class="sitem">(.+?)</li>', body, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'<h3>(.+?)</h3>', m) if m1 is not None: entry[cm.name_c] = m1.group(1).strip() m1 = re.search(ur'<div class="addr">(.+?)</div>', m) if m1 is not None: entry[cm.addr_e] = m1.group(1).replace(u'地址:', '').replace(u'地址:', '').strip() m1 = re.search(ur'<div class="tel">(.+?)</div>', m) if m1 is not None: entry[cm.tel] = m1.group(1).replace(u'电话:', '').replace(u'电话:', '').strip() entry[cm.city_c] = data['city'] ret = gs.look_up(data['city'], 3) if ret is not None: entry[cm.city_e] = ret['name_e'] entry[cm.city_c] = ret['name_c'] if ret['province'] != '': entry[cm.province_e] = ret['province']['name_e'] entry[cm.country_e] = u'CHINA' gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), 'canali_log.txt') db.insert_record(entry, 'stores') store_list.append(entry)
def get_frag_countries(url): # 获得国家代码 """ 获得国家的名字和代码 :rtype : [{'id':**, 'country':**}, ...] :param url: :return: """ try: html = common.get_data(url) except Exception: print 'Error occured: %s' % url_fragrance dump_data = {'level': 1, 'time': common.format_time(), 'data': {'url': url_fragrance}, 'brand_id': brand_id} common.dump(dump_data) return [], False start = html.find('<select name="country" id="id_country">') if start == -1: return [], False sub, s, e = common.extract_closure(html[start:], ur'<select\b', ur'</select>') if e == 0: return [], False return [{'id': string.atoi(m[0]), 'country': m[1].strip().upper()} for m in re.findall(ur'<option value="(\d+)".*?>(.+?)</option>', sub)]
def fetch_countries(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] # 处理重定向 m = re.search('<h2>Object moved to <a href="(.+?)">', html) if m is not None: data['url'] = data['host'] + m.group(1) return fetch_countries(data) m = re.search('<span class="country">Choose a country</span>', html) if m is None: return [] sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>') if end == 0: return [] country_list = [] for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub): d = data.copy() country_e = cm.html2plain(m[1]).strip().upper() ret = gs.look_up(country_e, 1) if ret is not None: country_e=ret['name_e'] d['country_e'] = country_e d['province_e'] = '' d['url'] = data['host'] + m[0] country_list.append(d) return country_list
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<item id="\d+">', body): sub = cm.extract_closure(body[m.start():], ur'<item\b', ur'</item>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'<country>([^<>]+)</country>', sub) if m1 is not None: tmp = m1.group(1).split('/') for v in tmp: ret = gs.look_up(v.strip().upper(), 1) if ret is not None: entry[cm.country_e] = ret['name_e'] break m1 = re.search(ur'<city>([^<>]+)</city>', sub) if m1 is not None: val = cm.reformat_addr(m1.group(1)) if entry[cm.country_e] == 'UNITED STATES': tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(val).strip(',')) if len(tmp_list) == 2: if re.search('[A-Z]{2}', tmp_list[1]): entry[cm.province_e] = tmp_list[1] entry[cm.city_e] = cm.extract_city(m1.group(1))[0] m1 = re.search(ur'<brands>([^<>]+)</brands>', sub) if m1 is not None: tmp = m1.group(1).split('/') brand_list = [] for v in tmp: if v.strip() != '': brand_list.append(v) entry[cm.store_type] = ', '.join(brand_map[key] for key in brand_list) m1 = re.search(ur'<name>([^<>]+)</name>', sub) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() m1 = re.search(ur'<address>([^<>]+)</address>', sub) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<tel>([^<>]+)</tel>', sub) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'sll=(-?\d+\.\d+),(-?\d+\.\d+)', sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], 
entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_countries(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] # 处理重定向 m = re.search('<h2>Object moved to <a href="(.+?)">', html) if m is not None: data['url'] = data['host'] + m.group(1) return fetch_countries(data) m = re.search('<span class="country">Choose a country</span>', html) if m is None: return [] sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>') if end == 0: return [] country_list = [] for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub): d = data.copy() country_e = cm.html2plain(m[1]).strip().upper() ret = gs.look_up(country_e, 1) if ret is not None: country_e = ret['name_e'] d['country_e'] = country_e d['province_e'] = '' d['url'] = data['host'] + m[0] country_list.append(d) return country_list
def fetch_name(cls, response, spider=None):
    """Return the reformatted product name, or None when not found.

    Non-cn pages carry the name inside the `var productJSONObject` literal;
    cn pages expose it via a hidden input's value attribute.
    """
    sel = Selector(response)
    meta = response.meta
    region = meta['userdata']['region'] if 'userdata' in meta else meta['region']
    name = None
    if region == 'cn':
        try:
            nodes = sel.xpath('//div[@id="hidden_sku_value"]/input[@id="title" and @value]')
            if nodes:
                name = unicodify(nodes[0]._root.attrib['value'])
        except(TypeError, IndexError):
            pass
        return name
    try:
        # Product info lives in the var productJSONObject literal.
        mt = re.search(r'var\s+productJSONObject\s*=', response.body)
        if mt:
            blob = cm.extract_closure(response.body[mt.end():], "{", "}")[0]
            data = json.loads(blob.replace(r'\"', '"').replace(r"\'", "'"))
            if 'productName' in data:
                name = cls.reformat(data['productName'])
    except(TypeError, IndexError, ValueError):
        pass
    return name
def get_store_list(data): """ 返回店铺列表,其中店铺包含国家信息。 :rtype : [{'name':'store name', 'url':'http://...', 'city':'NEW YORK', 'country:':'AUSTRALIA'}, ...] :param data: """ url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<ul>\s+?<h3 class="country-name">(.+?)</h3>', html, re.S): sub, start, end = cm.extract_closure(html[m.start():], ur'<ul>', ur'</ul>') if end == 0: continue # 得到不同国家的分割 splits = [[m1.start(), m1.group(1)] for m1 in re.finditer(ur'<h3 class="country-name">(.+?)</h3>', sub)] splits.append([-1, '']) for i in xrange(len(splits) - 1): # 在同一个国家下寻找 sub1 = sub[splits[i][0]:splits[i + 1][0]] country = splits[i][1].upper() for m1 in re.findall(ur'<li>\s*?<a href="(http://us.christianlouboutin.com/us_en/storelocator/\S+?)">' ur'(.+?)</a>,(.+?)</li>', sub1): store_list.append({'name': m1[1].strip(), 'url': m1[0], 'city': m1[2].strip().upper(), 'country': country})
def fetch_countries(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = html.find('<select name="country" id="inp-country"') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select>') if end == 0: return [] country_list = [] for m in re.findall(ur'<option value="([A-Z]{2})">(.*?)</option>', sub): d = data.copy() d['country_code'] = m[0] d[cm.country_c] = m[1].strip() for key in [cm.country_e, cm.continent_e, cm.continent_c]: d[key] = '' ret = gs.look_up(d['country_code'], 1) if ret is not None: d[cm.country_e] = ret['name_e'] d[cm.country_c] = ret['name_c'] d[cm.continent_c] = ret['continent']['name_c'] d[cm.continent_e] = ret['continent']['name_e'] country_list.append(d)
def fetch_color(cls, response, spider=None):
    """Return lowercase colour names from the productJSONObject swatch group.

    Only non-cn regions are handled. On a parse failure the function
    deliberately returns None (not []) to preserve the original contract.
    """
    sel = Selector(response)
    meta = response.meta
    region = meta['userdata']['region'] if 'userdata' in meta else meta['region']
    colors = []
    if region == 'cn':
        # TODO the original spider had no colour parsing for the cn site
        return colors
    try:
        # Product info lives in the var productJSONObject literal.
        mt = re.search(r'var\s+productJSONObject\s*=', response.body)
        if mt:
            blob = cm.extract_closure(response.body[mt.end():], "{", "}")[0]
            data = json.loads(blob.replace(r'\"', '"').replace(r"\'", "'"))
            colors = [cls.reformat(sw['color']).lower()
                      for sw in data['swatchGroup']['swatches'] if 'color' in sw]
    except (KeyError, ValueError, TypeError, IndexError):
        # Preserve original behaviour: signal failure with None.
        colors = None
    return colors
def fetch_store_details(data):
    """POST the city search to the store locator and persist the single result.

    :param data: dict with 'url', 'country_code', 'city' and brand fields.
    :return: [entry] on success, [] on failure.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {'cCode': data['country_code'], 'city': data['city'], 'postsearch': 1})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url},
                     'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []
    start = body.find('<div class="store_locator')
    if start == -1:
        print 'Failed processing %s' % url
        return []
    sub, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S)
    if m is not None:
        addr_list = cm.reformat_addr(m.group(1)).split(', ')
        # If the last address line is a phone number, move it to tel.
        ret = cm.extract_tel(addr_list[-1])
        if ret != '':
            entry[cm.tel] = ret
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)
        addr_text = sub[m.end():]
        # A "title locator" block after the address overrides it when present.
        m = re.search(ur'<div class="title locator">', addr_text)
        if m is not None:
            tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b', ur'</div>')[0]
            m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S)
            if m1 is not None:
                entry[cm.addr_e] = cm.reformat_addr(m1.group(1))
    # Coordinates from the embedded Google Maps call.
    m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)', body, re.S)
    if m is not None:
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))
    entry[cm.country_e] = data['country_code']
    entry[cm.city_e] = data['city']
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                      entry[cm.name_e], entry[cm.addr_e],
                                                      entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
def get_store_details(data):
    """Fetch one store page, parse address/geo/contact fields, insert into DB.

    NOTE(review): returns a bare entry dict on success while the sibling
    *_details fetchers in this file return lists -- confirm callers here
    expect a dict before unifying.

    :param data: dict with 'url', 'name', 'country', 'city'; uses module-level
        brand_id / brandname_e / brandname_c.
    :return: the inserted entry dict, or [] on failure.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url},
                     'brand_id': brand_id}
        cm.dump(dump_data)
        return []
    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    entry[cm.name_e] = data['name']
    entry[cm.url] = data['url']
    start = html.find(ur'<div class="storelocator-breadcrumbs">')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>')
    if end == 0:
        return []
    # The last <li>...</li> of the breadcrumbs is the address.
    m = re.findall(ur'<li>(.+?)</li>', sub, re.S)
    if len(m) > 0:
        entry[cm.addr_e] = cm.reformat_addr(m[-1])
    # Latitude / longitude
    m = re.findall(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html)
    if len(m) > 0:
        cm.update_entry(entry, {cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1])})
    m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S)
    if m is not None:
        contact_sub = m.group(1)
        pat_tel = re.compile(ur'<p class="phone">(.+?)</p>')
        m1 = re.search(pat_tel, contact_sub)
        if m1:
            entry[cm.tel] = cm.extract_tel(m1.group(1))
            # Remove the phone paragraph so the remainder is opening hours.
            contact_sub = re.sub(pat_tel, '', contact_sub)
        hours_list = [tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',')]
        if 'opening hours' in hours_list[0].lower():
            del hours_list[0]
        entry[cm.hours] = ', '.join(hours_list)
    # Geo
    country = data['country']
    city = data['city']
    cm.update_entry(entry, {cm.country_e: country, cm.city_e: city})
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return entry
def parse_product_details(self, response):
    """Assemble a ProductItem from the product page, plus zoom image URLs.

    Returns None (drops the item) when no model can be extracted, since the
    image URLs below are keyed on the model code.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    metadata['url'] = response.url
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # model is mandatory; abandon this item
        return
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail
    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors
    image_urls = []
    mt = re.search(r'var\s+jsoninit_item', response.body)
    if mt:
        # Zoomable image list lives after the AVAILABLEZOOM marker.
        idx = response.body[mt.regs[0][1]:].find('AVAILABLEZOOM')
        if idx != -1:
            idx += mt.regs[0][1]
            tmp = json.loads(
                cm.extract_closure(response.body[idx:], '{', '}')[0].replace("'", '"'))
            for c in tmp:
                model = metadata['model']
                # Keys are model-code suffixes; match against our model.
                if re.search(c + '$', model, flags=re.I):
                    # Found the zoomed images
                    image_urls = [
                        str.format('http://cdn.yoox.biz/{0}/{1}_{2}.jpg', model[:2], model, val)
                        for val in tmp[c]
                    ]
    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    return item
def fetch_uk(body, data): start = body.find(u'<div class="fableft">') if start == -1: print "Error in finding %s stores" % data["name"] return [] body, start, end = cm.extract_closure(body[start:], ur"<div\b", ur"</div>") if end == 0: print "Error in finding %s stores" % data["name"] return [] store_list = [] for m in re.findall(ur"<div>\s*(.+?)\s*</div>", body, re.S): entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"]) entry[cm.country_e] = data["name"] addr_list = re.findall(ur"<p>\s*(.+?)\s*</p>", m) tel = cm.extract_tel(addr_list[-1]) if tel != "": entry[cm.tel] = tel del addr_list[-1] if data["name"] == "AUSTRALIA": country, province, city = gs.addr_sense(", ".join(addr_list), data["name"]) if city is not None: entry[cm.city_e] = city if province is not None: entry[cm.province_e] = province else: city = addr_list[-2].strip().upper() entry[cm.city_e] = city ret = gs.look_up(city, 3) if ret is not None and ret["country"]["name_e"] == gs.look_up("UK", 1)["name_e"]: entry[cm.city_e] = ret["name_e"] entry[cm.zip_code] = addr_list[-1].strip().upper() entry[cm.addr_e] = ", ".join(addr_list) entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0] gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == "": entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == "": entry[cm.city_e] = ret[2] gs.field_sense(entry) print "(%s / %d) Found store: %s, %s (%s, %s, %s)" % ( data["brandname_e"], data["brand_id"], entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e], ) db.insert_record(entry, "stores") store_list.append(entry)
def fetch_stores(data): url = data['url'] param = {'country_id': data['country_code'], 'city': '', 'label_id': '', 'lang': 'en'} try: body = cm.get_data(url, param) except Exception: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return [] start = body.find(ur'<stores>') if start == -1: cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name) return [] body = cm.extract_closure(body[start:], ur'<stores>', ur'</stores>')[0] store_list=[] for m in re.findall(ur'<store\b[^<>]+>(.+?)</store>', body): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = data['country_code'] m1 = re.search(ur'<name>(.+?)</name>', m) if m1 is not None: entry[cm.name_e] = cm.reformat_addr(m1.group(1).strip()) m1 = re.search(ur'<address>(.+?)</address>', m) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1).strip()) m1 = re.search(ur'<city>(.+)</city>', m) if m1 is not None: entry[cm.city_e] = cm.extract_city(m1.group(1))[0] m1 = re.search(ur'<zip>(.+?)</zip>', m) if m1 is not None: entry[cm.zip_code] = m1.group(1).strip() m1 = re.search(ur'<tel>(.+?)</tel>', m) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'<fax>(.+?)</fax>', m) if m1 is not None: entry[cm.fax] = m1.group(1).strip() m1 = re.search(ur'<email>(.+?)</email>', m) if m1 is not None: entry[cm.email] = m1.group(1).strip() m1 = re.search(ur'<link>(.+?)</link>', m) if m1 is not None: entry[cm.url] = m1.group(1).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') 
store_list.append(entry)
def fetch_store_list(data):
    """Build the store list from two JSON arrays embedded in the locator page.

    The page embeds a country array (after 'country_select') and a quick-search
    array (after 'loadQuickSearch') mixing 'city' and 'store' records.
    :return: list of per-store context dicts derived from *data*.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
    start = body.find(ur"'country_select'")
    if start == -1:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
    country_raw = json.loads(cm.extract_closure(body[start:], ur'\[', ur'\]')[0])
    country_map = {}
    for c in country_raw:
        country_map[string.atoi(c['id'])] = c['name']
    start = body.find(ur'loadQuickSearch')
    if start == -1:
        cm.dump('Error in fetching store list: %s' % url, log_name)
        return []
    raw = json.loads(cm.extract_closure(body[start:], ur'\[', ur'\]')[0])
    city_map = {}
    results = []
    # First pass: map every city id to its name and country.
    for item in raw:
        if item['type'] == 'city':
            country = country_map[string.atoi(item['parent_id'])]
            city_map[string.atoi(item['id'])] = {'name': item['name'], 'country': country}
    # Second pass: one result per store, joined to its parent city.
    for item in raw:
        if item['type'] == 'store':
            d = data.copy()
            d['name'] = item['name']
            d['city'] = dict(city_map[string.atoi(item['parent_id'])])
            d['url'] = data['url'] + item['store_url_alias']
            d['id'] = string.atoi(item['id'])
            results.append(d)
        elif item['type'] == 'city':
            continue
    return results
def fetch_cities(data):
    """Build country/city lookup maps from the store-locator country selector.

    NOTE(review): the visible body ends while still populating city_map and
    has no return statement — it appears truncated here; confirm against the
    original source before relying on its return value.
    """
    # Disabled alternative: fetch the city list via the storelocator POST API.
    # url = data['post_url']
    # try:
    #     action=yoox_storelocator_change_country&country_id=3125&dataType=JSON
    #     js = json.loads(cm.post_data(url, {'action': 'yoox_storelocator_change_country',
    #                                        'country_id': ,
    #                                        'retail_type': data['retail_type']}).decode('unicode_escape'))
    # except Exception:
    #     print 'Error occured in getting country list: %s' % url
    #     dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
    #     cm.dump(dump_data)
    #     return []
    url = data['home_url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []
    # Build the country and city lookup tables.
    country_map = {}
    city_map = {}
    start = html.find('<div id="storelocator-box-select-country"')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<div\b', ur'</div>')
    for m1 in re.finditer(ur'<a href=".+?" class="depth-1" data-store-id="(\d+)">(.+?)</a>', sub):
        country_id = string.atoi(m1.group(1))
        country_e = m1.group(2).strip().upper()
        country_map[country_id] = country_e
        # Cities of this country are in the <ul> immediately after the anchor.
        city_sub, s1, e1 = cm.extract_closure(sub[m1.end():], ur'<ul\b', ur'</ul>')
        for m2 in re.findall(ur'<li class=".+?"><a href=".+?" class="depth-2" data-store-id="(\d+)">(.+?)</a></li>',
                             city_sub):
            city_id = string.atoi(m2[0])
            city_e = m2[1].strip().upper()
            city_map[city_id] = {'city_e': city_e, 'parent': country_id}
def fetch_price(cls, response, spider=None):
    """Extract price info, or return a Request that fetches it.

    Non-CN regions carry prices inline in 'var productJSONObject'; the CN
    site requires a POST with the sku code to a separate price service.
    :return: dict with optional 'price'/'price_discount' keys, or a Request.
    """
    sel = Selector(response)
    ret = {}
    region = None
    if 'userdata' in response.meta:
        region = response.meta['userdata']['region']
    else:
        region = response.meta['region']
    old_price = None
    new_price = None
    if region != 'cn':
        # Product info lives in var productJSONObject.
        mt = re.search(r'var\s+productJSONObject\s*=', response.body)
        if mt:
            try:
                # The blob escapes quotes; unescape before json parsing.
                data = json.loads(cm.extract_closure(response.body[mt.end():], "{", "}")[0]
                                  .replace(r'\"', '"').replace(r"\'", "'"))
            except(TypeError, IndexError, ValueError):
                return ret
            # Price info: only the first swatch is consulted (note the break).
            try:
                for item in data['swatchGroup']['swatches']:
                    if 'listPrice' in item:
                        old_price = cls.reformat(item['listPrice'])
                    if 'unitPrice' in item:
                        new_price = cls.reformat(item['unitPrice'])
                    break
            except KeyError:
                pass
    else:
        tmp = sel.xpath('//div[@id="hidden_sku_value"]/input[@id="skuCode" and @value]')
        sku_code = None
        if tmp:
            sku_code = tmp[0]._root.attrib['value']
        if sku_code:
            # Price info must be POSTed for; hand back the Request to the caller.
            return Request(url=cls.spider_data['price_url'][region], method='POST', dont_filter=True,
                           body=str.format('skuCode={0}', sku_code), callback=cls.fetch_price_request,
                           errback=spider.onerror,
                           headers={'Content-Type': 'application/x-www-form-urlencoded',
                                    'Accept-Encoding': 'gzip,deflate,sdch',
                                    'X-Requested-With': 'XMLHttpRequest', 'Accept': '*/*'},
                           meta=response.meta)
    if old_price:
        ret['price'] = old_price
    if new_price:
        ret['price_discount'] = new_price
    return ret
def fetch_store_list(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching store lists: %s' % url, log_name) return [] start = body.find(ur"<div class='store-country'>") if start == -1: cm.dump('Error in fetching store lists: %s' % url, log_name) return [] body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0] start_stores = body.find(ur'<h3><a href="/store-locator/index">Stores</a></h3>') start_outlets = body.find(ur"<h3 class='outlets'>") store_sub = body[start_stores:start_outlets] outlet_sub = body[start_outlets:] results = [] for m1 in re.finditer(ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="country">([^<>]+)</a>', store_sub): country_id = string.atoi(m1.group(1)) country = m1.group(2).strip() sub1 = cm.extract_closure(store_sub[m1.end():], ur'<ul>', ur'</ul>')[0] for m2 in re.finditer(ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="city">([^<>]+)</a>', sub1): city_id = string.atoi(m2.group(1)) city = m2.group(2).strip() sub2 = cm.extract_closure(sub1[m2.end():], ur'<ul>', ur'</ul>')[0] for m3 in re.finditer(ur'<a href="([^"]+)"[^<>]*data-id="([^"]+)"[^<>]*data-type="store">([^<>]+)</a>', sub2): d = data.copy() d['country_id'] = country_id d['country'] = country d['city_id'] = city_id d['city'] = city d['url'] = m3.group(1).strip() d['store_id'] = string.atoi(m3.group(2)) d['store'] = cm.html2plain(m3.group(3).strip()) # d['store_type'] = 'store' results.append(d)
def get_subcat(html, pat): """ 比如,dunhill需要单独拿出来 :param html: :param pat: """ it = re.finditer(pat, html) try: m = it.next() sub_html, start, end = common.extract_closure(html[m.start() :], ur"<ul\b", ur"</ul>") return sub_html except StopIteration: return ""
def fetch_countries(data): url = data['home_url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = html.find(u'<div id="block-ps-shop-locator-shop-locator-filters"') if start == -1: return [] html, start, end = cm.extract_closure(html[start:], ur'<div\b', ur'</div>') if end == 0: return [] country_list = [] for m in re.finditer(ur'<h3>(.+?)</h3>', html): continent_e = m.group(1).strip().upper() if continent_e == u'UK' and False: d = data.copy() d[cm.continent_e] = u'EUROPE' d[cm.country_e] = u'UNITED KINGDOM' d[cm.url] = data['host'] + '/uk-en/shop-locator/gb/all' country_list.append(d) else: sub, start, end = cm.extract_closure(html[m.end():], ur'<ul\b', ur'</ul>') if end == 0: continue #<a href="/uk-en/shop-locator/fr/all">France</a> for m1 in re.findall(ur'<a href="(.+?)">(.+?)</a>', sub): d = data.copy() d[cm.continent_e] = continent_e d[cm.country_e] = m1[1].strip().upper() d[cm.url] = data['host'] + m1[0] if d[cm.country_e]=='SINGAPORE': country_list.append(d)
def fetch_world(body, data):
    """Locate the per-country sections (<h2> headers) of the world store page.

    NOTE(review): the visible body ends after collecting header offsets and
    returns nothing — it appears truncated; confirm against the original
    source before relying on its return value.
    """
    start = body.find(u'<div class="fableft">')
    if start == -1:
        print "Error in finding %s stores" % data["name"]
        return []
    body, start, end = cm.extract_closure(body[start:], ur"<div\b", ur"</div>")
    if end == 0:
        print "Error in finding %s stores" % data["name"]
        return []
    idx_list = []
    # Record where each country section starts so the text can be sliced later.
    for m in re.finditer(ur"<h2>(.+?)</h2>", body):
        idx_list.append({"idx": m.end(), "name": m.group(1).strip().upper()})
def fetch_store_details(data):
    """Fill a store entry from its detail page (address, contacts, hours).

    NOTE(review): the visible body ends after parsing opening hours without
    a return statement — it appears truncated; confirm against the original
    source before relying on its return value.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []
    start = body.find(ur'<div class="col first" itemprop="address"')
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    addr = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
    m = re.search(ur'<span itemprop="postalCode">([^<>]+)</span>', addr, re.S)
    if m is not None:
        entry[cm.zip_code] = m.group(1).strip()
    entry[cm.addr_e] = cm.reformat_addr(addr)
    # Contact block (optional).
    start = body.find(ur'<div class="col" itemprop="contactPoints"')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
        m = re.search(ur'<span itemprop="telephone">([^<>]+)</span>', sub, re.S)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'<span itemprop="faxNumber">([^<>]+)</span>', sub, re.S)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
    # Opening hours table (optional); cells are joined with spaces.
    start = body.find(ur'<h2>opening hours</h2>')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<table\b', ur'</table>')[0]
        tmp = []
        for m in re.findall(ur'<td>(.+?)</td>', sub):
            tmp.append(cm.html2plain(m).strip())
        entry[cm.hours] = ' '.join(tmp)
def fetch_indv(data):
    """Collect <h2> header positions inside the page's <article> element.

    NOTE(review): the visible body ends after collecting header positions
    and returns nothing — it appears truncated; confirm against the
    original source before relying on its return value.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []
    body, start, end = cm.extract_closure(body, ur'<article\b', ur'</article>')
    tmp = []
    # Store both start and end offsets so sections can be sliced later.
    for m in re.finditer(ur'<h2>\s*(.+?)\s*</h2>', body):
        tmp.append({'idx1': m.start(), 'idx2': m.end(), 'name': m.group(1).strip().upper()})
def fetch_hk(data): loc_list = ('Hong Kong', 'Kowloon', 'Macau', 'New Territories') url = 'http://levi.com.hk/hk/storelocator' store_list = [] for loc in loc_list: param = {'loc': loc} try: body = cm.get_data(url, param) except Exception, e: cm.dump('Error in fetching stores: %s' % param, log_name) continue start = body.find(ur'<div id="addWrapper">') if start == -1: cm.dump('Error in fetching stores: %s' % param, log_name) continue sub = cm.extract_closure(body[start:], ur'<ul>', ur'</ul>')[0] for s in re.findall(ur'<li>(.+?)</li>', sub, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) entry[cm.country_e] = 'MACAU' if loc == 'Macau' else 'HONG KONG' entry[cm.city_e] = entry[cm.country_e] m = re.search(ur'<div id="addStore">([^<>]+)', s) entry[cm.addr_e] = cm.html2plain(m.group(1)) if m else '' m = re.search(ur'<div id="addAddress">([^<>]+)', s) tmp = cm.html2plain(m.group(1)) pat = re.compile(ur'business hours?\s*[:\.]?\s*', re.I) if re.search(pat, tmp): entry[cm.hours] = re.sub(pat, '', tmp).strip() m = re.search(ur'<div id="addPhone">([^<>]+)', s) tmp = cm.html2plain(m.group(1)) pat = re.compile(ur'(tel|phone|telephone)?\s*[:\.]?\s*', re.I) if re.search(pat, tmp): entry[cm.tel] = re.sub(pat, '', tmp).strip() gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None and entry[cm.province_e] == '': entry[cm.province_e] = ret[1] if ret[2] is not None and entry[cm.city_e] == '': entry[cm.city_e] = ret[2] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_states(data):
    """Return per-state/province context dicts for one country page."""
    print '(%s/%d) Found country: %s' % (data['brandname_e'], data['brand_id'], data['country_e'])
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []
    # Handle redirect: follow the moved-to URL and restart at the country level.
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)
    # ('provence' is the site's own spelling, not a typo here.)
    m = re.search('<span class="state">Choose a state/provence</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []
    state_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        province_e = cm.html2plain(m[1]).strip().upper()
        if data['country_e'] == 'CHINA':
            # Remove the spaces inside Chinese province names.
            province_e = province_e.replace(' ', '')
        # Canonicalize the province name when the lookup table knows it.
        ret = gs.look_up(province_e, 2)
        if ret is not None:
            province_e = ret['name_e']
        d = data.copy()
        d['province_e'] = province_e
        d['url'] = data['host'] + m[0]
        state_list.append(d)
    return state_list
def fetch_model(cls, response, spider=None):
    """Return the product model ('cod10') from the jsoninit_dejavu JS blob.

    :param response: page response whose body embeds 'var jsoninit_dejavu'.
    :param spider: unused; kept for interface compatibility.
    :return: upper-cased model string, or None when absent/unparseable.
    """
    # Removed unused local: the original built a Selector it never read.
    model = None
    try:
        mt = re.search(r'var\s+jsoninit_dejavu\s*=\s*\{\s*ITEM:', response.body)
        if not mt:
            return
        # mt.end() is equivalent to the original mt.regs[0][1], but clearer.
        tmp = json.loads(
            cm.extract_closure(response.body[mt.end():], '{', '}')[0])
        if 'cod10' in tmp:
            model = tmp['cod10']
    except (TypeError, IndexError, ValueError):
        # BUG FIX: ValueError added — malformed JSON previously propagated
        # out of the method (sibling fetch_price methods already catch it).
        pass
    return model.upper() if model else None
def parse_details(self, response):
    """Parse a product page into a ProductItem.

    Image URLs come from the 'largeimage' fragments inside the @rel
    attribute of the color-switch anchors.
    """
    metadata = response.meta['userdata']
    metadata['url'] = response.url
    sel = Selector(response)
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # A model code is mandatory; abandon the page without one.
        return
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    image_urls = []
    for tmp in sel.xpath('//a[@href and @class="switchACss" and @rel]/@rel').extract():
        try:
            idx = tmp.find('largeimage')
            if idx == -1:
                continue
            # extract_closure returns the quoted url; [1:-1] strips the quotes.
            image_url = self.process_href(cm.extract_closure(tmp[idx:], "'", "'")[0][1:-1], response.url)
            if image_url not in image_urls:
                image_urls.append(image_url)
        except (KeyError, ValueError, IndexError):
            continue
    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def fetch_price(cls, response, spider=None):
    """Return price info parsed from the jsoninit_dejavu JS blob.

    :param response: page response whose body embeds 'var jsoninit_dejavu'.
    :param spider: unused; kept for interface compatibility.
    :return: dict with optional 'price'/'price_discount' keys (always a dict).
    """
    # Removed unused local: the original built a Selector it never read.
    ret = {}
    old_price = None
    new_price = None
    try:
        mt = re.search(r'var\s+jsoninit_dejavu\s*=\s*\{\s*ITEM:', response.body)
        if not mt:
            # BUG FIX: was a bare 'return' (None) — callers do
            # "if 'price' in ret" on the result, which crashed on None.
            return ret
        tmp = json.loads(
            cm.extract_closure(response.body[mt.regs[0][1]:], '{', '}')[0])
        if 'price' in tmp:
            old_price = tmp['price']
    except (TypeError, IndexError, ValueError):
        # BUG FIX: ValueError added — malformed JSON previously propagated.
        pass
    if old_price:
        ret['price'] = old_price
    if new_price:
        ret['price_discount'] = new_price
    return ret
def parse_details_us(self, response):
    """Parse a US product page; also spawns requests for other color variants."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    metadata['url'] = response.url
    # Look for alternative color versions of this product.
    try:
        idx = response.body.find('var productURLs')
        # Inline JSON uses single quotes; normalize before parsing.
        data = json.loads(cm.extract_closure(response.body[idx:], '\{', '\}')[0].replace("'", '"'))
        for color_key in data:
            tmp = sel.xpath(str.format('//select/option[@value="{0}"]', color_key))
            if not tmp:
                continue
            color_node = tmp[0]
            # Is this the currently selected color?
            if not color_node.xpath('@selected'):
                # Not selected: crawl the variant page with a copied metadata.
                m = copy.deepcopy(metadata)
                tmp = color_node.xpath('text()').extract()
                if tmp:
                    m['color'] = [self.reformat(tmp[0])]
                yield Request(url=self.process_href(data[color_key], response.url),
                              callback=self.spider_data['callbacks'][metadata['region']][2],
                              errback=self.onerr, meta={'userdata': m})
            else:
                tmp = color_node.xpath('text()').extract()
                if tmp:
                    metadata['color'] = [self.reformat(tmp[0])]
    except ValueError:
        pass
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # A model code is mandatory; abandon the page without one.
        return
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail
    image_urls = []
    for img_node in sel.xpath('//div[contains(@class,"slider_selector") or @id="frg_thumb_list"]/ul'
                              '/li[contains(@id,"productAngle")]//img[@src or @data-url]'):
        tmp = img_node.xpath('@data-url').extract()
        if tmp:
            image_urls.append(self.process_href(tmp[0], response.url))
        else:
            # Fall back to the thumbnail src, upgraded to its _zoom variant.
            tmp = img_node.xpath('@src').extract()[0]
            a, b = os.path.splitext(tmp)
            image_urls.append(self.process_href(str.format('{0}_zoom{1}', a, b), response.url))
    #image_urls = [self.process_href(val, response.url) for val in
    #              sel.xpath('//div[contains(@class,"slider_selector") or @id="frg_thumb_list"]/ul'
    #                        '/li[contains(@id,"productAngle")]/img[@src and @data-url]/@data-url').extract()]
    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    yield item
def parse_fashion(self, response):
    """Parse a Chanel fashion page via its inline 'var settings' JSON."""
    self.log(str.format('PARSE_FASHION: {0}', response.url), level=log.DEBUG)
    # Infer the region from the URL's first path component.
    mt = re.search(r'chanel\.com/([^/]+)/', response.url)
    region = None
    for a, b in self.spider_data['base_url'].items():
        if b == mt.group(1):
            region = a
            break
    if not region:
        self.log(str.format('NO VAR SETTINGS: {0}', response.url), log.ERROR)
        return
    metadata = {
        'region': region,
        'brand_id': self.spider_data['brand_id'],
        'url': response.url,
        'tags_mapping': {}
    }
    mt = re.search(r'var\s+settings', response.body)
    if not mt:
        self.log(str.format('NO VAR SETTINGS: {0}', response.url), log.ERROR)
        return
    content = cm.extract_closure(response.body[mt.start():], '{', '}')[0]
    try:
        data = json.loads(content)
    except ValueError:
        self.log(
            str.format('FAILED TO LOAD VAR SETTINGS: {0}', response.url),
            log.ERROR)
        return
    try:
        metadata['pricing_service'] = data['servicesURL']['pricing']
    except KeyError:
        metadata['pricing_service'] = None
    # images
    metadata['image_urls'] = set([])
    if 'detailsGridJsonUrl' in data['sectionCache']:
        # Details live behind a separate JSON endpoint; fetch it via proxy.
        temp = data['sectionCache']['detailsGridJsonUrl']
        if re.search(r'^http://', temp):
            url = temp
        else:
            url = str.format('{0}{1}', self.spider_data['hosts'][region], temp)
        try:
            proxy_enabled = self.crawler.settings.values['PROXY_ENABLED']
        except IndexError:
            proxy_enabled = False
        yield ProxiedRequest(url=url, meta={'userdata': metadata},
                             callback=self.parse_json_request, dont_filter=True,
                             errback=self.onerr,
                             proxy_region=metadata['region'],
                             proxy_enabled=proxy_enabled)
    else:
        # Details are already inline; parse them directly.
        for val in self.parse_json(metadata, data['sectionCache']):
            yield val
def parse_details(self, response):
    """Parse a Coach product page; may defer to parse_cat to resolve categories."""
    # Determine the owning region from the URL's domain.
    region = None
    for tmp in self.spider_data['domains']:
        if self.spider_data['domains'][tmp] in response.url:
            region = tmp
            break
    if not region:
        return
    metadata = {'region': region, 'brand_id': self.spider_data['brand_id'],
                'tags_mapping': {}, 'url': response.url}
    # Derive category info from the Referer header; if the referer is not
    # cached yet, fetch it first and come back here afterwards.
    referer = response.request.headers['Referer']
    if referer not in self.url_cat_dict:
        return Request(url=referer, callback=self.parse_cat,
                       meta={'stash': response, 'coach-referer': referer,
                             'callback': self.parse_details},
                       errback=self.onerr, dont_filter=True)
    tag_list = self.url_cat_dict[referer]
    for tag in tag_list:
        metadata['tags_mapping'][tag['type']] = [{'name': tag['name'], 'title': tag['title']}]
    # Product info lives in var productJSONObject.
    mt = re.search(r'var\s+productJSONObject\s*=', response.body)
    if not mt:
        return
    try:
        # The blob escapes quotes; unescape before json parsing.
        data = json.loads(cm.extract_closure(response.body[mt.end():], "{", "}")[0]
                          .replace(r'\"', '"').replace(r"\'", "'"))
    except(TypeError, IndexError, ValueError):
        return
    if 'style' not in data:
        return
    metadata['model'] = data['style']
    if 'productName' in data:
        metadata['name'] = self.reformat(data['productName'])
    try:
        metadata['color'] = [self.reformat(swatch['color']).lower()
                             for swatch in data['swatchGroup']['swatches'] if 'color' in swatch]
    except KeyError:
        pass
    # Price info: only the first swatch is consulted (note the break).
    try:
        for item in data['swatchGroup']['swatches']:
            if 'listPrice' in item:
                metadata['price'] = self.reformat(item['listPrice'])
            if 'unitPrice' in item:
                metadata['price_discount'] = self.reformat(item['unitPrice'])
            break
    except KeyError:
        pass
    # Image links, assembled from the scene7 CDN naming scheme.
    image_urls = []
    try:
        image_host = 'http://s7d2.scene7.com/is/image/Coach/{0}{1}'
        style_for_images = data['styleForImages']
        for item in data['swatchGroup']['swatches']:
            for subimg in ('aImages', 'nImages', 'mImages'):
                for tmp in [val['imageName'] for val in item[subimg]]:
                    if tmp not in image_urls:
                        image_urls.append(tmp)
        image_urls = [str.format(image_host, style_for_images, val) for val in image_urls]
    except KeyError:
        pass
    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    return item
def parse_details(self, response):
    """Parse a product page; rebuild zoom image URLs from the jsinit_item blob.

    The og:image URL supplies the header/tail pattern; the 'ALTERNATE' array
    supplies the per-angle fragments.
    """
    metadata = response.meta['userdata']
    metadata['url'] = response.url
    sel = Selector(response)
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # A model code is mandatory; abandon the page without one.
        return
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors
    # image_urls = sel.xpath('//div[@id="itemContent"]//img/@src').extract()
    # Collect images: split the og:image URL into header and tail parts.
    hdr = None
    tail = None
    img0 = sel.xpath(
        '//meta[@property="og:image" and @content]/@content').extract()
    if img0:
        img0 = img0[0]
        mt = re.search(r'(.+)_\d+_\w(\..+)$', img0)
        if mt:
            hdr = mt.group(1)
            tail = mt.group(2)
    idx = response.body.find('jsinit_item')
    img_item = None
    if idx != -1:
        tmp = response.body[idx:]
        idx = tmp.find('ALTERNATE')
        if idx != -1:
            try:
                img_item = json.loads(
                    cm.extract_closure(tmp[idx:], r'\[', r'\]')[0])
            except ValueError:
                pass
    image_urls = []
    if hdr and tail and img_item:
        for item in img_item:
            mt = re.search(r'(\d+)_\w', item)
            if not mt:
                continue
            start_idx = int(mt.group(1))
            # Enumerate candidate angle indices from the fragment's index up to 15.
            for idx in xrange(start_idx, 15):
                tmp = re.sub(r'\d+_(\w)', str.format(r'{0}_\1', idx), item)
                image_urls.append(str.format('{0}_{1}{2}', hdr, tmp, tail))
    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def fetch_stores(data):
    """Parse one city page's inline 'var data' JSON array into store entries.

    :return: list of store entries (each entry is also inserted into the DB).
    """
    print '(%s/%d) Found city: %s' % (data['brandname_e'], data['brand_id'], data['city_e'])
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []
    # Handle redirect: follow the moved-to URL and restart at the country level.
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)
    m = re.search('var\s+data\s*=\s*', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'\[', r'\]')
    if end == 0:
        return []
    store_list = []
    for s in json.loads(sub):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        name = s['Name']
        # Chinese names go into name_c, everything else into name_e.
        if cm.is_chinese(name):
            entry[cm.name_c] = name
        else:
            entry[cm.name_e] = name
        entry[cm.addr_e] = cm.html2plain(s['Street'])
        entry[cm.city_e] = cm.extract_city(data['city_e'])[0]
        entry[cm.country_e] = data['country_e']
        entry[cm.province_e] = data['province_e']
        # Strip the "tel"/"fax" prefixes from the raw contact strings.
        pat = re.compile(ur'tel[\.: ]*', re.I)
        entry[cm.tel] = re.sub(pat, '', s['Phone']).strip()
        pat = re.compile(ur'fax[\.: ]*', re.I)
        entry[cm.fax] = re.sub(pat, '', s['Fax']).strip()
        entry[cm.email] = s['Email'].strip()
        entry[cm.url] = s['Website'].strip()
        coord = s['LatLng']
        if coord is not None and len(coord) >= 2:
            if coord[0] is not None:
                entry[cm.lat] = string.atof(coord[0])
            if coord[1] is not None:
                entry[cm.lng] = string.atof(coord[1])
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        print '(%s/%d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
    return store_list
def fetch_price(cls, response, spider=None):
    """Dispatch price extraction by the URL shape (watch / sku / fashion pages).

    Depending on the page type this returns either a dict with optional
    'price'/'price_discount' keys or a ProxiedRequest that fetches the
    price asynchronously.
    """
    sel = Selector(response)
    ret = {}
    response.meta['url'] = response.url
    if 'userdata' in response.meta:
        region = response.meta['userdata']['region']
    else:
        region = response.meta['region']
    # Alternation patterns covering every supported region / watch term.
    region_code = '|'.join(cls.spider_data['base_url'][reg] for reg in cls.get_supported_regions())
    watch_code = []
    for r in cls.get_supported_regions():
        if r in cls.spider_data['watch_term']:
            watch_code.extend(cls.spider_data['watch_term'][r])
    watch_code = '|'.join(watch_code)
    old_price = None
    new_price = None
    mt = re.search(
        unicode.format(ur'chanel\.com/({0})/({1})/.+', region_code, watch_code),
        response.url)
    if mt:
        # Corresponds to parse_watch: the price comes from a dedicated endpoint.
        price_url = str.format(
            'http://www-cn.chanel.com/{0}/{1}/collection_product_detail?product_id={2}&maj=price',
            cls.spider_data['base_url'][region],
            cls.spider_data['watch_term'][region][0],
            cls.fetch_model(response))
        return ProxiedRequest(url=price_url,
                              callback=cls.fetch_price_request_watch,
                              errback=spider.onerror,
                              meta=response.meta,
                              proxy_enabled=True,
                              proxy_region=region)
    else:
        mt = re.search(
            str.format(r'chanel\.com/({0})/.+\?sku=\d+$', region_code),
            response.url)
        if mt:
            # Corresponds to parse_sku1.
            # TODO the original price-extraction code for this url type was never found.
            pass
        else:
            mt = re.search(
                str.format(r'chanel\.com/({0})/.+/sku/\d+$', region_code),
                response.url)
            if mt:
                # Corresponds to parse_sku2: price is in the page markup.
                temp = sel.xpath(
                    '//div[contains(@class, "product_detail_container")]')
                if len(temp) > 0:
                    product_name = temp[0]
                    temp = product_name.xpath(
                        './/h3[@class="product_price"]')
                    if len(temp) > 0:
                        old_price = unicodify(temp[0]._root.text)
            else:
                mt = re.search(
                    str.format(r'chanel\.com/({0})/.+(?<=/)s\.[^/]+\.html', region_code),
                    response.url)
                if mt:
                    # Fashion page: price hides in the 'var settings' JSON.
                    mt = re.search(r'var\s+settings', response.body)
                    content = cm.extract_closure(
                        response.body[mt.start():], '{', '}')[0]
                    try:
                        data = json.loads(content)
                        if 'detailsGridJsonUrl' in data['sectionCache']:
                            temp = data['sectionCache'][
                                'detailsGridJsonUrl']
                            if re.search(r'^http://', temp):
                                url = temp
                            else:
                                url = str.format(
                                    '{0}{1}',
                                    cls.spider_data['hosts'][region], temp)
                            return ProxiedRequest(
                                url=url,
                                meta=response.meta,
                                callback=cls.fetch_price_request_fashion_json,
                                proxy_enabled=True,
                                proxy_region=region,
                                dont_filter=True,
                                errback=spider.onerror)
                        else:
                            return cls.fetch_price_request_fashion(
                                response.meta, data['sectionCache'], spider)
                    except (KeyError, TypeError, IndexError):
                        pass
                else:
                    pass
    if old_price:
        ret['price'] = old_price
    if new_price:
        ret['price_discount'] = new_price
    return ret
def parse_details(self, response):
    """Parse a Hermes configurable-product page; emits one item per product id."""
    def func(product_id):
        # Build one ProductItem for a single simple-product id.
        m = copy.deepcopy(metadata)
        # if product_id in data['simpleProductPrices']:
        #     m['price'] = data['simpleProductPrices'][product_id]
        image_url = data['baseImages'][product_id]
        # Try to locate the zoom variant of the image (plain or JS-escaped).
        zoom_image_url = re.sub(r'/default/([^/]+)$', r'/zoom/\1', image_url)
        if zoom_image_url in unicodify(response.body):
            image_url = zoom_image_url
        elif zoom_image_url.replace('/', r'\/') in unicodify(response.body):
            image_url = zoom_image_url
        # m['description'] = self.reformat(data['descriptions'][product_id])
        # m['name'] = self.reformat(data['names'][product_id])
        # m['model'] = data['skus'][product_id]
        #
        # TODO this may pick the wrong page url, e.g.:
        # http://usa.hermes.com/jewelry/gold-jewelry/bracelets/configurable-product-104820b-23578.html
        # if product_id in data['links']:
        #     m['url'] = data['links'][product_id]
        # else:
        #     m['url'] = response.url
        #
        for attrib in data['attributes']:
            attrib_name = attrib['code']
            # if re.search(r'color[\b_]', attrib_name):
            #     attrib_name = 'color'
            # elif re.search('size_sized', attrib_name):
            #     attrib_name = 'size'
            temp = [
                unicodify(val['label']).lower() for val in attrib['options']
                if product_id in val['products']
            ]
            # if attrib_name == 'color':
            #     m['color'] = temp
            # else:
            #     m['tags_mapping'][unicodify(attrib_name).lower()] = \
            #         [{'name': val.lower(), 'title': val} for val in temp]
            if attrib_name != 'color':
                m['tags_mapping'][unicodify(attrib_name).lower()] = \
                    [{'name': val.lower(), 'title': val} for val in temp]
        # if 'category-1' in m['tags_mapping']:
        #     m['category'] = [val['name'] for val in m['tags_mapping']['category-1']]
        item = ProductItem()
        item['image_urls'] = [image_url]
        item['url'] = m['url']
        item['model'] = m['model']
        item['metadata'] = m
        return item

    metadata = response.meta['userdata']
    metadata['url'] = response.url
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # A model code is mandatory; abandon the page without one.
        return
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    # The product-configuration JSON appears under either initializer name.
    idx = response.body.find('spConfig.init')
    if idx == -1:
        idx = response.body.find('ConfProduct.init')
        if idx == -1:
            return
    body = cm.extract_closure(response.body[idx:], '{', '}')[0]
    data = json.loads(body)
    for val in (func(product_id) for product_id in data['productIds']):
        yield val
def parse_product(self, response):
    """Parse a product page; scans the body for 'xlarge:' image arrays."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    # We do not recurse into the other color pages here: the image scan
    # below already picks up the pictures for every color.
    # # Other color pages
    # color_href_nodes = sel.xpath('//div[@class="variationattributes"]/div[@class="swatches color"]/ul/li/a[@href]')
    # for node in color_href_nodes:
    #     m = copy.deepcopy(metadata)
    #
    #     href = node.xpath('./@href').extract()[0]
    #     href = self.process_href(href, response.url)
    #
    #     Request(url=href,
    #             callback=self.parse_product,
    #             errback=self.onerr,
    #             meta={'userdata': m})
    metadata['url'] = response.url
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # A model code is mandatory; abandon the page without one.
        return
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail
    image_urls = []
    try:
        start = 0
        # Walk every 'xlarge:' array in the page body.
        # NOTE(review): the offset arithmetic here looks suspect — the search
        # runs on response.body[start:], but mt.start() (relative to that
        # slice) is then applied to the full body, and 'start' is reassigned
        # from extract_closure's end offset. Verify against the original
        # source before changing.
        while 1:
            mt = re.search(r'xlarge:', response.body[start:])
            if mt:
                result = common.extract_closure(response.body[mt.start():], '\[', '\]')
                content = result[0]
                start = result[2]
                if 0 == start:
                    break
                url_list = re.findall('"url":.*\'(.+)\?.*\'', content)
                for url in url_list:
                    image_urls += [self.process_href(url, response.url)]
            else:
                break
    except (TypeError, IndexError):
        pass
    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item