示例#1
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<item id="\d+">', body):
        sub = cm.extract_closure(body[m.start():], ur'<item\b', ur'</item>')[0]
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m1 = re.search(ur'<country>([^<>]+)</country>', sub)
        if m1 is not None:
            tmp = m1.group(1).split('/')
            for v in tmp:
                ret = gs.look_up(v.strip().upper(), 1)
                if ret is not None:
                    entry[cm.country_e] = ret['name_e']
                    break
        m1 = re.search(ur'<city>([^<>]+)</city>', sub)
        if m1 is not None:
            val = cm.reformat_addr(m1.group(1))
            if entry[cm.country_e] == 'UNITED STATES':
                tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(val).strip(','))
                if len(tmp_list) == 2:
                    if re.search('[A-Z]{2}', tmp_list[1]):
                        entry[cm.province_e] = tmp_list[1]
            entry[cm.city_e] = cm.extract_city(m1.group(1))[0]
        m1 = re.search(ur'<brands>([^<>]+)</brands>', sub)
        if m1 is not None:
            tmp = m1.group(1).split('/')
            brand_list = []
            for v in tmp:
                if v.strip() != '':
                    brand_list.append(v)
            entry[cm.store_type] = ', '.join(brand_map[key] for key in brand_list)
        m1 = re.search(ur'<name>([^<>]+)</name>', sub)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1).strip()
        m1 = re.search(ur'<address>([^<>]+)</address>', sub)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'<tel>([^<>]+)</tel>', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).strip()
        m1 = re.search(ur'sll=(-?\d+\.\d+),(-?\d+\.\d+)', sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None:
            entry[cm.province_e] = ret[1]
            gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
示例#2
0
    def get_detailed_store(html, store_cat):
        store_list = []
        start = 0
        while True:
            sub_html, start, end = common.extract_closure(html, ur"<li\b", ur"</li>")
            if end == 0:
                break

            # 得到单个门店的页面代码
            html = html[end:]
            entry = common.init_store_entry(brand_id, brandname_e, brandname_c)

            m = re.findall(ur'<div class="store-title -h3a">(.+?)</div>', sub_html)
            if len(m) > 0:
                entry[common.name_e] = common.reformat_addr(m[0])
            m = re.findall(ur'<div class="store-address">(.+?)</div>', sub_html, re.S)
            if len(m) > 0:
                addr = common.reformat_addr(m[0])
                # 最后一行是否为电话号码?
                terms = addr.split(", ")
                tel = common.extract_tel(terms[-1])
                if tel != "":
                    addr = ", ".join(terms[:-1])
                    entry[common.tel] = tel
                entry[common.addr_e] = addr

            # 获得门店类型
            # store_type = [store_cat]
            type_html, type_start, type_end = common.extract_closure(sub_html, ur'<ul class="service-list">', ur"</ul>")
            if type_end != 0:
                store_type = [m for m in re.findall(ur'<li class="service-item">(.+?)</li>', type_html)]
                store_type.insert(0, store_cat)
                entry[common.store_type] = ", ".join(store_type)
            else:
                entry[common.store_type] = store_cat

            # 获得经纬度
            m = re.findall(ur'data-latitude="(-?\d+\.\d+)"', sub_html)
            if len(m) > 0:
                entry[common.lat] = string.atof(m[0])
            m = re.findall(ur'data-longitude="(-?\d+\.\d+)"', sub_html)
            if len(m) > 0:
                entry[common.lng] = string.atof(m[0])

            entry[common.city_e] = common.extract_city(data[common.city_e])[0]
            entry[common.country_e] = common.reformat_addr(data[common.country_e]).strip().upper()
            gs.field_sense(entry)

            print "%s: Found store: %s, %s (%s, %s, %s)" % (
                brandname_e,
                entry[common.name_e],
                entry[common.addr_e],
                entry[common.city_e],
                entry[common.country_e],
                entry[common.continent_e],
            )
            db.insert_record(entry, "stores")
            store_list.append(entry)

        return store_list
示例#3
0
文件: liujo.py 项目: haizi-zh/firenze
def fetch_stores(data):
    param = {'action': 'getStoresFromAjax', 'country': data['country_code'],
             'region': data['city'], 'collection': ''}
    url = data['url']
    try:
        body = cm.post_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []

    store_list = []
    for m1 in re.finditer(ur'<div class="shop-type-container">', body):
        sub = cm.extract_closure(body[m1.start():], ur'<div\b', ur'</div>')[0]
        store_class = ''
        m2 = re.search(ur'<div class="shop-type-title">(.+?)</div>', sub, re.S)
        if m2 is not None:
            store_class = cm.reformat_addr(m2.group(1))

        for m2 in re.finditer(ur'<div class="shop"', sub):
            store_sub = cm.extract_closure(sub[m2.start():], ur'<div\b', ur'</div>')[0]
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.store_class] = store_class
            entry[cm.country_e] = data['country_code']
            entry[cm.city_e] = cm.extract_city(data['city'])[0]

            m3 = re.search(ur'loadStore\((\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\)', store_sub)
            if m3 is not None:
                data['store_id'] = string.atoi(m3.group(1))
                entry[cm.lat] = string.atof(m3.group(2))
                entry[cm.lng] = string.atof(m3.group(3))
                entry[cm.store_type] = ', '.join(get_detail(data))

            m3 = re.search(ur'<div class="shop-name shop-details shop-main-name">([^<>]+)</div>', store_sub)
            if m3 is not None:
                entry[cm.name_e] = m3.group(1).strip()
            addr_list = []
            m3 = re.search(ur'<div class="shop-street shop-details">([^<>]+)</div>', store_sub)
            if m3 is not None:
                addr_list.append(cm.reformat_addr(m3.group(1)))
            m3 = re.search(ur'<div class="shop-city shop-details">([^<>]+)</div>', store_sub)
            if m3 is not None:
                tmp = cm.reformat_addr(m3.group(1))
                m3 = re.search(ur'(\d{4,})', tmp)
                if m3 is not None:
                    entry[cm.zip_code] = m3.group(1).strip()
                addr_list.append(tmp)
            entry[cm.addr_e] = ', '.join(addr_list)

            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            gs.field_sense(entry)
            cm.dump('(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                    entry[cm.name_e], entry[cm.addr_e],
                                                                    entry[cm.city_e],
                                                                    entry[cm.country_e],
                                                                    entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)
示例#4
0
def get_store_details(data):
    """Scrape a single store detail page and save the resulting entry.

    Reads ``url``, ``name``, ``country`` and ``city`` from ``data``. Returns an
    empty list when the page cannot be downloaded or the ``<ul>`` following the
    breadcrumbs marker is missing; otherwise returns the entry after inserting
    it into the ``stores`` table.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id})
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    entry[cm.name_e] = data['name']
    entry[cm.url] = data['url']

    crumbs_at = html.find(ur'<div class="storelocator-breadcrumbs">')
    if crumbs_at == -1:
        return []
    crumbs, _, crumbs_end = cm.extract_closure(html[crumbs_at:], ur'<ul>', ur'</ul>')
    if crumbs_end == 0:
        return []

    # The address is the last <li>...</li> of that list.
    items = re.findall(ur'<li>(.+?)</li>', crumbs, re.S)
    if items:
        entry[cm.addr_e] = cm.reformat_addr(items[-1])

    # Latitude and longitude
    coords = re.search(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html)
    if coords is not None:
        cm.update_entry(entry, {cm.lat: string.atof(coords.group(1)),
                                cm.lng: string.atof(coords.group(2))})

    contact = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S)
    if contact is not None:
        contact_html = contact.group(1)
        phone_pat = re.compile(ur'<p class="phone">(.+?)</p>')
        phone = phone_pat.search(contact_html)
        if phone:
            entry[cm.tel] = cm.extract_tel(phone.group(1))
            # Drop the phone paragraph so only the opening hours remain.
            contact_html = phone_pat.sub('', contact_html)
        hours = [part.strip() for part in cm.reformat_addr(contact_html).split(',')]
        if 'opening hours' in hours[0].lower():
            # Discard the leading "opening hours" caption.
            hours = hours[1:]
        entry[cm.hours] = ', '.join(hours)

    # Geo
    cm.update_entry(entry, {cm.country_e: data['country'], cm.city_e: data['city']})
    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if sensed[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = sensed[1]
    if sensed[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = sensed[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return entry
示例#5
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m1 in re.finditer(ur'<lignecountry\s+titre\s*=\s*"([^"]+)"', body):
        country = m1.group(1).strip().upper()
        if country == 'U.S.A.':
            country = 'US'
        sub_country = cm.extract_closure(body[m1.start():], ur'<lignecountry\b', ur'</lignecountry>')[0]
        for m2 in re.finditer(ur'<lignecity\s+titre\s*=\s*"([^"]+)"', sub_country):
            city = m2.group(1).strip().upper()
            sub_city = cm.extract_closure(sub_country[m2.start():], ur'<lignecity\b', ur'</lignecity>')[0]
            m3 = re.search(ur'<!\[CDATA\[(.+?)\]\]>', sub_city, re.S)
            if m3 is None:
                continue
            sub_city = m3.group(1)
            store_subs = re.split(ur'<\s*h2\s*>\s*LANVIN BOUTIQUE\s*<\s*/h2\s*>', sub_city)
            for s in store_subs:
                if s.strip() == '':
                    continue
                m4 = re.search(ur'<p>(.+?)</p>', s, re.S)
                if m4 is None:
                    continue
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                entry[cm.country_e] = country
                entry[cm.city_e] = city
                s = m4.group(1)
                m4 = re.search(ur'(.+?)\n\s*\n', s, re.S)
                if m4 is not None:
                    entry[cm.addr_e] = cm.reformat_addr(m4.group(1))
                m4 = re.search(ur'Phone:(.+?)\n\s*\n', s, re.S)
                if m4 is not None:
                    entry[cm.tel] = cm.reformat_addr(m4.group(1).strip())
                m4 = re.search(ur'Boutique Hours:(.+?)\n\s*\n', s, re.S)
                if m4 is not None:
                    entry[cm.hours] = cm.reformat_addr(m4.group(1).strip())
                m4 = re.search(ur'Products available:(.+?)\n\s*\n', s, re.S)
                if m4 is not None:
                    entry[cm.store_type] = m4.group(1).strip()
                m4 = re.search(ur'Email:\s*<a href="mailto:([^"]+)">', s)
                if m4 is not None:
                    entry[cm.email] = m4.group(1).strip()
                gs.field_sense(entry)
                ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                gs.field_sense(entry)
                cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                    entry[cm.name_e], entry[cm.addr_e],
                                                                    entry[cm.country_e],
                                                                    entry[cm.continent_e]), log_name)
                db.insert_record(entry, 'stores')
                store_list.append(entry)
示例#6
0
def fetch_stores(data):
    url = data['url']
    param = {'country_id': data['country_code'], 'city': '', 'label_id': '', 'lang': 'en'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []

    start = body.find(ur'<stores>')
    if start == -1:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<stores>', ur'</stores>')[0]

    store_list=[]
    for m in re.findall(ur'<store\b[^<>]+>(.+?)</store>', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country_code']
        m1 = re.search(ur'<name>(.+?)</name>', m)
        if m1 is not None:
            entry[cm.name_e] = cm.reformat_addr(m1.group(1).strip())
        m1 = re.search(ur'<address>(.+?)</address>', m)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1).strip())
        m1 = re.search(ur'<city>(.+)</city>', m)
        if m1 is not None:
            entry[cm.city_e] = cm.extract_city(m1.group(1))[0]
        m1 = re.search(ur'<zip>(.+?)</zip>', m)
        if m1 is not None:
            entry[cm.zip_code] = m1.group(1).strip()
        m1 = re.search(ur'<tel>(.+?)</tel>', m)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).strip()
        m1 = re.search(ur'<fax>(.+?)</fax>', m)
        if m1 is not None:
            entry[cm.fax] = m1.group(1).strip()
        m1 = re.search(ur'<email>(.+?)</email>', m)
        if m1 is not None:
            entry[cm.email] = m1.group(1).strip()
        m1 = re.search(ur'<link>(.+?)</link>', m)
        if m1 is not None:
            entry[cm.url] = m1.group(1).strip()

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
示例#7
0
文件: loewe.py 项目: haizi-zh/firenze
def fetch_stores(data):
    url = data['store_url']
    param = {'store_country': data['country_code'], 'store_city': data['city_code']}
    try:
        body = cm.post_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []

    store_list = []
    for s in re.findall(ur'<marker\b([^<>]+)/\s*>', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m = re.search(ur'store_name="([^"]+)"', s)
        if m is not None:
            entry[cm.name_e] = cm.reformat_addr(m.group(1))
        entry[cm.country_e] = data['country_code']
        entry[cm.city_e] = data['city']
        addr_list = []
        for key in ['store_mall_name', 'store_address', 'store_zip_code']:
            m = re.search(ur'%s="([^"]+)"' % key, s)
            if m is not None:
                tmp = cm.reformat_addr(m.group(1))
                if tmp != '':
                    addr_list.append(tmp)
        entry[cm.addr_e] = ', '.join(addr_list)
        m = re.search(ur'store_zip_code="([^"]+)"', s)
        if m is not None:
            entry[cm.zip_code] = m.group(1).strip()
        m = re.search(ur'store_telephone="([^"]+)"', s)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'store_fax="([^"]+)"', s)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
        m = re.search(ur'store_email="([^"]+)"', s)
        if m is not None:
            entry[cm.email] = m.group(1).strip()
        m = re.search(ur'store_latitude="([^"]+)"', s)
        if m is not None:
            entry[cm.lat] = string.atof(m.group(1).strip())
        m = re.search(ur'store_longitude="([^"]+)"', s)
        if m is not None:
            entry[cm.lng] = string.atof(m.group(1).strip())

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        gs.field_sense(entry)

        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
示例#8
0
文件: y3.py 项目: haizi-zh/firenze
def fetch_stores(data):
    """
    Fetch the detailed information of the stores listed at ``data['url']``.

    :rtype : [entries]
    :param data: mapping providing ``url`` plus the ``cm.continent_e``,
        ``cm.country_e`` and ``cm.city_e`` values applied to every store.
    """
    # Bind the URL before the request so the error handler can report it.
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    entries = []
    start = html.find(u'<ul class="store-list">')
    if start == -1:
        return entries
    start += len(u'<ul class="store-list">')
    end = html.find(u'</ul>', start)
    # find() gives -1 when the list is not closed; slicing with -1 would
    # silently drop the last character, so take the remainder instead.
    html = html[start:] if end == -1 else html[start:end]

    # The class attribute of each <li> is recorded as the store type.
    for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S):
        store = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        store[cm.store_type] = m1[0]
        sub_html = m1[1]
        m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html)
        if len(m2) > 0:
            store[cm.name_e] = cm.reformat_addr(m2[0])
        m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S)
        if len(m2) > 0:
            store[cm.addr_e] = cm.reformat_addr(m2[0])

        cm.update_entry(store, {cm.continent_e: data[cm.continent_e].strip().upper(),
                                cm.country_e: data[cm.country_e].strip().upper(),
                                cm.city_e: data[cm.city_e].strip().upper()})

        entry = store
        gs.field_sense(entry)
        # Fill country / province / city from the address only where still empty.
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, store[cm.name_e], store[cm.addr_e], store[cm.country_e],
            store[cm.continent_e])
        db.insert_record(store, 'stores')
        entries.append(store)

    return entries
示例#9
0
def fetch_stores(data):
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    code = data['country_code']
    if gs.look_up(code, 1) is None:
        entry[cm.country_e] = cm.html2plain(data['country']).strip().upper()
    else:
        entry[cm.country_e] = code
    entry[cm.name_e] = data['store_name']
    entry[cm.city_e] = cm.extract_city(data['city'])[0]
    entry[cm.lat] = data['lat'] if data['lat'] is not None else ''
    entry[cm.lng] = data['lng'] if data['lng'] is not None else ''

    m = re.search(ur'data-boutique\s*=\s*"%s"' % data['store_id'], data['content'])
    sub = data['content'][m.end():]

    m1 = re.search(ur'<li class="isDistributeur[^<>]+>(.+?)</li>', sub)
    if m1 is not None:
        entry[cm.store_class] = cm.reformat_addr(m1.group(1))

    m1 = re.search(ur'<li class="place-title[^<>]+>(.+?)</li>', sub, re.S)
    if m1 is not None:
        entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

    m1 = re.search(ur'<li class="contacts[^<>]+>(.+?)</li>', sub, re.S)
    if m1 is not None:
        m2 = re.search(ur'<a class="popupLaunch" href="([^"]+)"', m1.group(1))
        if m2:
            entry = fetch_details(data, m2.group(1), entry)

        m2 = re.search(ur'<p>(.+?)</p>', m1.group(1), re.S)
        if m2:
            ct_list = tuple(tmp.strip() for tmp in cm.reformat_addr(m2.group(1)).split(','))
            entry[cm.tel] = cm.extract_tel(ct_list[0])
            if len(ct_list) > 1:
                entry[cm.email] = ct_list[1].strip()

    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e]), log_name)
    db.insert_record(entry, 'stores')

    return tuple(entry)
示例#10
0
def fetch_details(data):
    """Scrape the store page at ``data[cm.url]`` and save the resulting entry.

    Name, country and continent are copied from ``data``; city, zip code,
    address, phone, opening hours and coordinates are read from the page.
    Returns a one-element list holding the saved entry, or an empty list when
    the page cannot be downloaded or has no ``field-address`` block.
    """
    url = data[cm.url]
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.name_e] = data[cm.name_e]

    addr_at = html.find(ur'<div class="field-address">')
    if addr_at == -1:
        return []
    addr_html, _, addr_end = cm.extract_closure(html[addr_at:], ur'<div\b', ur'</div>')
    if addr_end == 0:
        return []

    locality = re.search(ur'<div  class="locality">(.+?)</div>', addr_html)
    if locality is not None:
        entry[cm.city_e] = cm.extract_city(locality.group(1))[0]
    postal = re.search(ur'<div  class="postal-code">(.+?)</div>', addr_html)
    if postal is not None:
        entry[cm.zip_code] = postal.group(1).strip()
    entry[cm.country_e] = data[cm.country_e]

    # Replace every tag with a line break before normalising the address text.
    tag_pat = re.compile(ur'<[^<>]+?>', re.S)
    entry[cm.addr_e] = cm.reformat_addr(tag_pat.sub(u'\r\n', addr_html))

    phone = re.search(ur'<div class="field-telephone"><a href=".+?" class="tel">(.+?)</a></div>', html)
    if phone is not None:
        entry[cm.tel] = phone.group(1).strip()

    hours = re.search(ur'<div class="field-opening-hours">\s*<p>(.+?)</p>\s*</div>', html, re.S)
    if hours is not None:
        entry[cm.hours] = cm.reformat_addr(hours.group(1))

    coords = re.search(ur'"coordinates":\[(-?\d+\.\d{4,})\s*,\s*(-?\d+\.\d{4,})\]', html)
    if coords is not None:
        cm.update_entry(entry, {cm.lat: string.atof(coords.group(1)),
                                cm.lng: string.atof(coords.group(2))})

    entry[cm.continent_e] = data[cm.continent_e]
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
示例#11
0
def get_store_details(data):
    """Request one store's JSON details and save the resulting entry.

    Reads ``url``, ``country_id``, ``city_id``, ``store_id``, ``country`` and
    ``city`` from ``data``. Returns an empty list when the request fails;
    otherwise returns the entry after inserting it into the ``stores`` table.
    """
    url = data['url']
    try:
        html = cm.post_data(url, {'country': data['country_id'], 'city': data['city_id'],
                                  'recordid': data['store_id']})
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id})
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    info = json.loads(html)['elements']

    # Drop backslashes and turn paragraph tags into commas before normalising.
    raw_addr = info['address'].replace('\\', '').replace('<p>', ',').replace('</p>', ',')
    addr = cm.reformat_addr(raw_addr)
    # The first line is the store name.
    terms = addr.split(',')
    if terms:
        entry[cm.name_e] = cm.reformat_addr(terms[0])
    entry[cm.addr_e] = addr

    # The first "<lat>,<lng>" pair in the map URL gives the coordinates.
    coords = re.search(ur'(-?\d+\.\d+),(-?\d+\.\d+)', info['gmap'])
    if coords is not None:
        cm.update_entry(entry, {cm.lat: string.atof(coords.group(1)),
                                cm.lng: string.atof(coords.group(2))})

    entry[cm.url] = info['shareurl'].replace('\\', '')
    entry[cm.hours] = info['openingtimes']
    entry[cm.comments] = info['other']

    # Geo
    cm.update_entry(entry, {cm.country_e: data['country'], cm.city_e: data['city']})
    entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if sensed[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = sensed[1]
    if sensed[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = sensed[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])

    db.insert_record(entry, 'stores')
    return entry
示例#12
0
def fetch_stores(data):
    """Download the JSON store list at ``data['home_url']`` and save each store.

    Reads ``home_url``, ``brand_id``, ``brandname_e`` and ``brandname_c`` from
    ``data``. Returns the list of entries inserted into the ``stores`` table,
    or an empty list when the download fails.
    """
    url = data['home_url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    results = json.loads(body)['results']
    saved = []
    for store_key in results:
        record = results[store_key]
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = record['post_title']
        entry[cm.url] = record['post_permalink'].replace(u'\\', '')
        entry[cm.country_e] = record['country'].strip().upper()
        entry[cm.city_e] = record['city'].strip().upper()

        if '_yoox_store_latlong' in record:
            # Accept the value only when it holds exactly two decimal numbers.
            numbers = re.findall(ur'-?\d+\.\d+', record['_yoox_store_latlong'])
            if len(numbers) == 2:
                entry[cm.lat] = string.atof(numbers[0])
                entry[cm.lng] = string.atof(numbers[1])

        if 'store_phone' in record:
            # Remove the "P:" / "T:" labels, then any remaining P or T letters.
            phone = record['store_phone'].replace('P:', '').replace('T:', '')
            entry[cm.tel] = phone.replace('P', '').replace('T', '').strip()
        if 'store_email' in record:
            entry[cm.email] = record['store_email']
        if 'store_fax' in record:
            entry[cm.fax] = record['store_fax'].replace('F:', '').replace('F', '').strip()
        if 'store_hours' in record:
            entry[cm.hours] = cm.reformat_addr(record['store_hours'])
        if 'store_address' in record:
            entry[cm.addr_e] = cm.reformat_addr(record['store_address'])
        if 'women' in record and 'men' in record:
            women = ', '.join(record['women'])
            men = ', '.join(record['men'])
            entry[cm.store_type] = 'Women: %s, men: %s' % (women, men)

        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e])
        db.insert_record(entry, 'stores')
        saved.append(entry)

    return saved
示例#13
0
文件: etro.py 项目: haizi-zh/firenze
def fetch_store_details(data):
    """Scrape the store page at ``data['url']`` and save the resulting entry.

    :param data: mapping providing ``url``, ``country``, ``brand_id``,
        ``brandname_e`` and ``brandname_c``.
    :return: one-element list holding the saved entry; an empty list when the
        page could not be downloaded.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    ret = gs.look_up(data['country'], 1)
    if ret is not None:
        entry[cm.country_e] = ret['name_e']
    m = re.search(ur'<span class="type">Address</span>\s*<p>(.+?)</p>', body, re.S)
    if m is not None:
        addr = cm.reformat_addr(m.group(1))
        # The sensed country is only a fallback; a sensed province or city
        # always overrides the entry.
        country, province, city = gs.addr_sense(addr)
        if country is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = country
        if province is not None:
            entry[cm.province_e] = province
        if city is not None:
            entry[cm.city_e] = city
        entry[cm.addr_e] = addr

    m = re.search(ur'<span class="type">Phone</span>\s*<p>(.+?)</p>', body, re.S)
    if m is not None:
        entry[cm.tel] = m.group(1)

    m = re.search(ur'<span class="type">Opening hours</span>\s*<p>(.+?)</p>', body, re.S)
    if m is not None:
        entry[cm.hours] = cm.reformat_addr(m.group(1))

    m = re.search(ur'<span class="type">You can find</span>\s*<p>(.+?)</p>', body, re.S)
    if m is not None:
        entry[cm.store_type] = cm.reformat_addr(m.group(1))

    # Coordinates are optional like every other field: a page without a
    # LatLng call must not abort the whole store.
    m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', body, re.S)
    if m is not None:
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))

    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                      entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                      entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
示例#14
0
def fetch_stores(data):
    """
    Post a store query for one country and record every store of the JSON reply.

    :param data: mapping read for 'url', 'country_code', 'brand_id', 'brandname_e',
        'brandname_c' and the cm.country_e key.
    :return: list of the entries built from raw['stores'], or [] when cm.post_data raises.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {'rsp': 'json', 'country': data['country_code']})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    raw = json.loads(body)
    # Labels indexed by the single digit captured from the opening-hours string.
    days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    store_list = []
    for s in raw['stores']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = cm.html2plain(s['name']).strip()

        addr_list = []
        for key in ['address1', 'address2']:
            if s[key].strip() != '':
                addr_list.append(cm.reformat_addr(s[key]))
        entry[cm.addr_e] = ' '.join(addr_list)

        # Province detection from s['region'] is disabled:
        # r=s['region'].strip().upper()
        # m = re.search(ur'\b([A-Z]{2})\b', r)
        # if data[cm.country_e]=='UNITED STATES' and m is not None:
        #     # United States
        #     ret = gs.look_up(m.group(1), 2)
        #     if ret is not None:
        #         r = ret['name_e']
        # entry[cm.province_e] = r

        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        entry[cm.zip_code] = s['zip'].strip()
        entry[cm.country_e] = data[cm.country_e]
        entry[cm.lat] = string.atof(s['lat'])
        entry[cm.lng] = string.atof(s['lng'])
        entry[cm.tel] = s['phone'].strip()
        entry[cm.fax] = s['fax'].strip()
        entry[cm.email] = s['emailaddress'].strip()
        entry[cm.url] = s['website'].strip()

        opening = []
        if 'openingHours' in s and s['openingHours'] is not None:
            for m in re.finditer(ur'i:(\d);s:\d+:\\?"([^\\"]+?)\\?"', s['openingHours']):
                opening.append('%s: %s' % (days[string.atoi(m.group(1))], m.group(2).strip()))
            entry[cm.hours] = ', '.join(opening)

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        # Sensed province/city only fill fields that are still blank.
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'],
                                                              entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e],
                                                              entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # The collected entries were previously dropped (implicit None) although the
    # error path returns a list.
    return store_list
示例#15
0
def fetch_stores(data):
    """
    Extract the stores embedded as Google-Maps markers in one page and record them.

    :param data: mapping read for 'url', 'brand_id', 'brandname_e' and 'brandname_c'.
    :return: list of the entries that had a <b>…</b> name, or [] when cm.get_data raises.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    # Each match is (marker HTML, latitude, longitude).
    for m in re.findall(ur'var markerContent\s*?=\s*?"(.+?)".+?'
                        ur'createMarker\(.+?new google.maps.LatLng\((-?\d+\.\d+),(-?\d+\.\d+)\)', html, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        lat, lng = map(string.atof, [m[1], m[2]])
        cm.update_entry(entry, {cm.lat: lat, cm.lng: lng})

        sub = m[0].strip()
        m1 = re.search(ur'<b>(.+?)</b>', sub)
        if m1 is None:
            # A marker without a bold name is skipped.
            continue
        entry[cm.name_c] = m1.group(1)
        sub = sub.replace(m1.group(0), '')
        # NOTE(review): the original alternation listed the ASCII colon twice;
        # the second alternative is presumed to have been the full-width colon
        # (U+FF1A) and is spelled as an escape so it survives re-encoding.
        m1 = re.search(ur'聯系電話(?::|\uff1a)(.+?)<', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1)
            # Keep the '<' that terminated the match so the following tag stays intact.
            sub = sub.replace(m1.group(0), '<')
        sub = re.sub(ur'<img\b.*?/>', '', sub)
        # Whatever is left after removing name, phone and images is the address.
        entry[cm.addr_c] = cm.reformat_addr(sub)

        print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_c], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # Previously the function fell off the end and returned None on success.
    return store_list
示例#16
0
文件: folli.py 项目: haizi-zh/firenze
def fetch_store_details(data):
    """
    Post a city query, parse the store_locator block of the reply and record the store.

    :param data: mapping read for 'url', 'country_code', 'city', 'brand_id', 'brandname_e'
        and 'brandname_c'.
    :return: a one-element list holding the entry; [] when cm.post_data raises or the
        reply has no '<div class="store_locator' block.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {'cCode': data['country_code'], 'city': data['city'], 'postsearch': 1})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = body.find('<div class="store_locator')
    if start == -1:
        print 'Failed processing %s' % url
        return []
    sub, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    # Without an address paragraph the title search below scans the whole block;
    # the original dereferenced the failed match and raised AttributeError.
    addr_text = sub
    m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S)
    if m is not None:
        addr_list = cm.reformat_addr(m.group(1)).split(', ')
        # When the last address term yields a phone number, move it to the tel field.
        ret = cm.extract_tel(addr_list[-1])
        if ret != '':
            entry[cm.tel] = ret
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)
        addr_text = sub[m.end():]

    m = re.search(ur'<div class="title locator">', addr_text)
    if m is not None:
        tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S)
        if m1 is not None:
            # The <h2> text replaces the address parsed above.
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

    m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)', body, re.S)
    if m is not None:
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))

    entry[cm.country_e] = data['country_code']
    entry[cm.city_e] = data['city']
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                      entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                      entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
示例#17
0
def fetch_stores(data):
    """
    Build and record one store entry per item of the "rawPos" list of a JSON reply.

    :param data: mapping read for "host", "country_url", "country_id", "brand_id",
        "brandname_e" and "brandname_c".
    :return: list of the entries, or [] when cm.get_data raises.
    """
    url = data["host"] + data["country_url"] % data["country_id"]
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump("Error in fetching countries: %s" % url, log_name)
        return []

    positions = json.loads(body)["rawPos"]
    store_list = []
    for s in positions:
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])

        # address1 .. address4, cleaned; blank lines are left out of the joined address.
        addr_lines = [cm.html2plain(s["address%d" % idx]).strip() for idx in xrange(1, 5)]
        entry[cm.addr_e] = ", ".join([line for line in addr_lines if line != ""])

        entry[cm.city_e] = cm.extract_city(s["city"]["name"])[0]
        entry[cm.country_e] = s["country"]["countryCode"]
        entry[cm.email] = s["email"]
        entry[cm.fax] = s["fax"]

        # Coordinates are only converted when the reply carries a non-empty value.
        latitude = s["latitude"]
        if latitude != "":
            entry[cm.lat] = string.atof(latitude)
        longitude = s["longitude"]
        if longitude != "":
            entry[cm.lng] = string.atof(longitude)

        entry[cm.hours] = cm.reformat_addr(s["openingSchedule"])

        phones = [s[key].strip() for key in ("phone1", "phone2")]
        entry[cm.tel] = ", ".join([number for number in phones if number != ""])

        entry[cm.zip_code] = s["postalCode"]
        entry[cm.name_e] = s["shopName"]

        gs.field_sense(entry)
        sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        sensed_province = sensed[1]
        if sensed_province is not None and entry[cm.province_e] == "":
            entry[cm.province_e] = sensed_province
        sensed_city = sensed[2]
        if sensed_city is not None and entry[cm.city_e] == "":
            entry[cm.city_e] = sensed_city
        gs.field_sense(entry)

        message = "(%s / %d) Found store: %s, %s (%s, %s)" % (
            data["brandname_e"], data["brand_id"], entry[cm.name_e], entry[cm.addr_e],
            entry[cm.country_e], entry[cm.continent_e])
        cm.dump(message, log_name)
        db.insert_record(entry, "stores")
        store_list.append(entry)

    return store_list
示例#18
0
def fetch_stores(data):
    # country=Greece&city=ATHENS&adutl=+01&kids=+02&undercolor=+06&togetmap=mapdata
    url = data['data_url']
    param = {'country': data['country'], 'city': data['city'], 'adutl': ' 01', 'kids': ' 02',
             'undercolor': ' 06', 'togetmap': 'mapdata'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), 'benetton_log.txt', False)
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.findall(ur'<marker (.+?)>', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m1 = re.search(ur'name=\\"(.+?)\\"', m)
        if m1 is not None:
            entry[cm.name_e] = cm.html2plain(m1.group(1).strip().replace(u'\\', ''))
        m1 = re.search(ur'address=\\"(.+?)\\"', m)
        if m1 is not None:
            addr = cm.reformat_addr(cm.html2plain(m1.group(1)).replace(u'\\', ''))
            tel = cm.extract_tel(addr)
            if tel != '':
                entry[cm.tel] = tel
                addr = addr.replace(tel, '')
            entry[cm.addr_e] = cm.reformat_addr(addr)

        m1 = re.search(ur'lat=\\"(.+?)\\"', m)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))

        m1 = re.search(ur'lng=\\"(.+?)\\"', m)
        if m1 is not None:
            entry[cm.lng] = string.atof(m1.group(1))

        entry[cm.country_e] = data['country'].strip().upper()
        entry[cm.city_e] = cm.extract_city(data['city'])[0]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), 'benetton_log.txt', False)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
示例#19
0
文件: ysl.py 项目: haizi-zh/firenze
    def f(m):
        store_name = m[0].strip()
        addr_str = m[1].strip()

        spl = addr_str.split('<br/>')
        store_type = cm.html2plain(spl[0].strip())

        store_addr = spl[1].strip()
        hour_idx = 2
        store_tel = ''
        for i in xrange(2, len(spl)):
            # If this is not a phone number:
            tel = cm.extract_tel(spl[i])
            if tel == '':
                store_addr += ', ' + spl[i]
                hour_idx = i + 1
            else:
                store_tel = spl[i].strip()
                hour_idx = i + 1
                break

        if hour_idx < len(spl):
            store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip()
        else:
            store_hour = ''

        # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]]))
        store_addr = cm.reformat_addr(store_addr)

        store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        cm.update_entry(store_entry,
                        {cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(),
                         cm.country_e: opt[cm.country_e].strip().upper(),
                         cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour,
                         cm.tel: store_tel})
        if opt.has_key(cm.province_e):
            store_entry[cm.province_e] = opt[cm.province_e]
        else:
            store_entry[cm.province_e] = ''
        store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0]

        gs.field_sense(store_entry)
        ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e])
        if ret[1] is not None and store_entry[cm.province_e] == '':
            store_entry[cm.province_e] = ret[1]
        if ret[2] is not None and store_entry[cm.city_e] == '':
            store_entry[cm.city_e] = ret[2]
        gs.field_sense(store_entry)

        print '%s Found store: %s, %s (%s, %s)' % (
            brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e],
            store_entry[cm.continent_e])
        db.insert_record(store_entry, 'stores')

        return store_entry
示例#20
0
文件: kenzo.py 项目: haizi-zh/firenze
def fetch(level=1, data=None, user='******', passwd=''):
    """
    Replace this brand's rows in 'stores' with the stores listed in the JSON at `url`.

    Uses the module-level url, host, brand_id, brandname_e and brandname_c.

    :param level: not used by this function.
    :param data: not used by this function.
    :param user: passed to db.connect_db.
    :param passwd: passed to db.connect_db.
    :return: list of the entries, or [] when cm.get_data raises.
    """
    db = cm.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

        try:
            html = cm.get_data(url)
        except Exception:
            print 'Error occured: %s' % url
            dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
            cm.dump(dump_data)
            return []

        js = json.loads(html)
        store_list = []
        for s in js['data']['list']:
            entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
            cm.update_entry(entry, {cm.lat: string.atof(s['geo']['lat']),
                                    cm.lng: string.atof(s['geo']['lng'])})
            entry[cm.name_e] = s['contact']['title']
            entry[cm.addr_e] = cm.reformat_addr(s['contact']['address'])
            entry[cm.tel] = s['contact']['phone']
            entry[cm.fax] = s['contact']['fax']
            entry[cm.hours] = cm.reformat_addr(s['contact']['hours'])
            entry[cm.store_type] = s['contact']['selling']
            entry[cm.url] = host + s['link']

            gs.update_city_map(s['city'], s['country'], s['continent'])
            cm.update_entry(entry, {cm.continent_e: s['continent'], cm.country_e: s['country'],
                                    cm.city_e: s['city']})
            gs.field_sense(entry)

            print '(%s / %d) Found store: %s, %s (%s, %s)' % (
                brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                entry[cm.continent_e])
            db.insert_record(entry, 'stores')
            store_list.append(entry)
    finally:
        # The connection used to stay open on the early error return and on any
        # exception raised while parsing.
        db.disconnect_db()

    gs.commit_maps(1)
    gs.commit_maps(3)
    return store_list
示例#21
0
文件: prada.py 项目: haizi-zh/firenze
def fetch_store_details(data):
    url = data['host'] + data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    start = body.find(ur'<h3>available in store</h3>')
    if start != -1:
        type_sub = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0]
        entry[cm.store_type] = ', '.join(
            cm.html2plain(tmp).strip() for tmp in re.findall(ur'<li[^<>]*>(.+?)</li>', type_sub, re.S))

    start = body.find(ur"<div class='gmap_info_box'")
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    raw = json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['table']
    entry[cm.name_e] = cm.html2plain(raw['name'])
    entry[cm.city_e] = data['city'].strip().upper()
    entry[cm.country_e] = data['country'].strip().upper()
    # entry[cm.store_type] = data['store_type']
    entry[cm.addr_e] = cm.reformat_addr(raw['address'])
    m = re.search(re.compile(ur'phone:(.*?)fax:(.*?)', re.I | re.S), raw['phone'])
    if m is not None:
        entry[cm.tel] = m.group(1).strip()
        entry[cm.fax] = m.group(2).strip()
    else:
        m = re.search(re.compile(ur'phone:(.*?)', re.I | re.S), raw['phone'])
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(re.compile(ur'fax:(.*?)', re.I | re.S), raw['phone'])
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
    entry[cm.hours] = raw['hours']
    if raw['lat'] is not None and raw['lat'] != '':
        entry[cm.lat] = string.atof(raw['lat'])
    if raw['lng'] is not None and raw['lng'] != '':
        entry[cm.lat] = string.atof(raw['lng'])
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None:
        entry[cm.province_e] = ret[1]
        gs.field_sense(entry)

    cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e]), log_name)
    db.insert_record(entry, 'stores')
    return [entry]
示例#22
0
文件: dkny.py 项目: haizi-zh/firenze
def fetch_stores(data):
    """
    Walk every result page of the JSON store listing for one country and record the stores.

    Fills the module-level region_map for the country on first use.

    :param data: mapping read for 'data_url', 'country_code', 'brand_id', 'brandname_e'
        and 'brandname_c'.
    :return: list of the entries from all pages; an empty tuple when cm.get_data raises
        on any page.
    """
    url = data['data_url']
    param = {'output': 'json', 'country': data['country_code'], 'brand': 'dkny'}
    page = 0
    tot_page = -1  # unknown until the first reply has been read
    store_list = []
    while True:
        page += 1
        if tot_page != -1 and page > tot_page:
            break
        param['p'] = page
        try:
            body = cm.get_data(url, param)
        except Exception:
            cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
            return ()

        raw = json.loads(body)
        tot_page = raw['Stores']['TotalPages']
        if data['country_code'] not in region_map:
            # Build the RegionId -> Name table for this country.
            region_map[data['country_code']] = dict((item['RegionId'], item['Name']) for item in raw['Regions'])

        for s in raw['Stores']['Items']:
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.country_e] = data['country_code'].upper()
            entry[cm.city_e] = cm.extract_city(s['City'])[0]
            entry[cm.name_e] = cm.html2plain(s['Name']).strip()
            entry[cm.addr_e] = cm.reformat_addr(s['Address'])
            # Falsy values (None, '') in the reply become empty strings.
            entry[cm.tel] = s['Phone'].strip() if s['Phone'] else ''
            entry[cm.fax] = s['Fax'].strip() if s['Fax'] else ''
            entry[cm.email] = s['Email'].strip() if s['Email'] else ''
            entry[cm.lat] = s['Latitude'] if s['Latitude'] else ''
            entry[cm.lng] = s['Longitude'] if s['Longitude'] else ''
            region_id = s['RegionId']
            if region_id in region_map[data['country_code']]:
                entry[cm.province_e] = cm.html2plain(region_map[data['country_code']][region_id]).strip().upper()

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            # Sensed province/city only fill fields that are still blank.
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)

    # Previously the function fell off the end after the last page and returned None.
    return store_list
示例#23
0
def fetch_stores(data):
    """
    Post a city query and record one store per <ul> element of the reply.

    :param data: mapping read for 'post_shops', 'city_e', 'brand_id', 'brandname_e',
        'brandname_c' and the cm.country_e / cm.city_e keys.
    :return: list of the entries parsed successfully (possibly partial when parsing
        aborts), or [] when cm.post_data raises.
    """
    url = data['post_shops']
    param = {'city': data['city_e'], 'paulandjoe_women': 0, 'paulandjoe_man': 0,
             'paulandjoe_sister': 0, 'paulandjoe_little': 0, 'paulandjoe_beauty': 0}
    try:
        html = cm.post_data(url, param)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    try:
        for store in (pq(tmp) for tmp in pq(html)('ul')):
            try:
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                entry[cm.name_e] = cm.html2plain(store('li.first')[0].text).strip()
                entry[cm.country_e] = data[cm.country_e]
                entry[cm.city_e] = data[cm.city_e]

                addr_list = []
                for term in (cm.reformat_addr(unicode(pq(tmp))) for tmp in store('li[class!="first"]')):
                    if term != '':
                        addr_list.append(term)
                # When the last term yields a phone number, move it to the tel field.
                tel = cm.extract_tel(addr_list[-1])
                if tel != '':
                    entry[cm.tel] = tel
                    del addr_list[-1]
                entry[cm.addr_e] = ', '.join(addr_list)

                gs.field_sense(entry)
                ret = gs.addr_sense(entry[cm.addr_e])
                # Sensed values only fill fields that are still blank.
                if ret[0] is not None and entry[cm.country_e] == '':
                    entry[cm.country_e] = ret[0]
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)
                print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e])
                store_list.append(entry)
                db.insert_record(entry, 'stores')
            except (IndexError, TypeError):
                # A <ul> without the expected items is logged and skipped.
                cm.dump(u'Error in parsing %s, %s' % (url, param), log_name)
                print traceback.format_exc()
                continue
    except Exception:
        print traceback.format_exc()

    # Previously the function fell off the end and returned None on success.
    return store_list
示例#24
0
def parse_store(data, body=None):
    if body is None:
        url = data['url']
        try:
            body = cm.post_data(url)
        except Exception:
            cm.dump('Error in fetching stores: %s' % url, log_name)
            return []

    start = body.find(ur'jQuery.extend(Drupal.settings,')
    latlng_map = {}
    if start != -1:
        for item in json.loads(cm.extract_closure(body[start:], ur'\{', ur'\}')[0])['getlocations']['key_1']['latlons']:
            latlng_map[cm.reformat_addr(item[3])] = {'lat': string.atof(item[0]), 'lng': string.atof(item[1])}
示例#25
0
def fetch_stores(data):
    """
    Record one store entry per element of the 'items' list of a JSON reply.

    :param data: mapping read for 'store_url', 'country', 'city', 'brand_id',
        'brandname_e' and 'brandname_c'.
    :return: list of the entries, or [] when cm.get_data raises.
    """
    url = data['store_url']
    try:
        body = cm.get_data(url, {'country': data['country'], 'city': data['city']})
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    items = json.loads(body)['items']
    store_list = []

    for item in items:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country'].strip().upper()

        # For 'USA' the value of data['city'] is stored as the province instead of the city.
        place = cm.extract_city(data['city'])[0]
        if entry[cm.country_e] != 'USA':
            entry[cm.city_e] = place
        else:
            entry[cm.province_e] = place
        gs.field_sense(entry)

        cleaned = cm.reformat_addr(item['address'].replace(u'\\', ''))
        addr_terms = []
        for term in cleaned.split(','):
            addr_terms.append(term.strip())
        # When the last term yields a phone number, move it to the tel field.
        tel = cm.extract_tel(addr_terms[-1])
        if tel != '':
            entry[cm.tel] = tel
            addr_terms.pop()
        entry[cm.addr_e] = ', '.join(addr_terms)
        entry[cm.store_type] = item['shop_type']

        gs.field_sense(entry)
        sensed = gs.addr_sense(entry[cm.addr_e])
        # Sensed values only fill fields that are still blank.
        sensed_country = sensed[0]
        if sensed_country is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = sensed_country
        sensed_province = sensed[1]
        if sensed_province is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = sensed_province
        sensed_city = sensed[2]
        if sensed_city is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = sensed_city
        gs.field_sense(entry)

        summary = (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e],
                   entry[cm.country_e], entry[cm.continent_e])
        print '(%s / %d) Found store: %s, %s (%s, %s)' % summary
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return store_list
示例#26
0
def fetch_stores(data):
    url = data['host'] + data['store_url']
    param = {'CC': data['country_code'], 'City': data['city']}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []

    store_list = []
    # pat_tel = re.compile(ur'tel:\s*', re.I)
    # pat_fax = re.compile(ur'fax:\s*', re.I)
    # pat_email = re.compile(ur'email:\s*', re.I)

    pat_tel = re.compile(ur'tel:\s*(.+?)(?=(?:tel|fax|email|$))', re.I | re.S)
    pat_fax = re.compile(ur'fax:\s*(.+?)(?=(?:tel|fax|email|$))', re.I | re.S)
    pat_email = re.compile(ur'email:\s*(.+?)(?=(?:tel|fax|email|$))', re.I | re.S)

    for m in re.finditer(ur'<div class="store-info">', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country_code']
        entry[cm.city_e] = cm.extract_city(data['city'])[0]

        sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<h2 class="store-name[^"]*">(.+?)</h2>', sub)
        if m1 is not None:
            entry[cm.name_e] = cm.reformat_addr(m1.group(1))
            entry[cm.store_class] = entry[cm.name_e]
        m1 = re.search(ur'<dt class="address"', sub)
        if m1 is not None:
            tmp = cm.reformat_addr(cm.extract_closure(sub[m1.end():], ur'<dd>', ur'</dd>')[0])
            entry[cm.addr_e] = tmp
            if len(tmp) > 1:
                m1 = re.search(ur'[\d\-]{4,}', tmp.split(',')[-2])
                if m1 is not None and len(re.findall(ur'\d', m1.group())) >= 4:
                    entry[cm.zip_code] = m1.group().strip()
示例#27
0
文件: mango.py 项目: haizi-zh/firenze
def fetch_stores(data):
    url = data['store_url']
    param = {'myid': data['key'], 'idioma': 'in'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []

    store_list = []
    for s in json.loads(body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.city_e] = cm.extract_city(data['city'])[0]
        entry[cm.name_e] = cm.reformat_addr(s['title'])

        m = re.search(ur'(.+?)-\s*<', s['key'])
        addr_list = [entry[cm.name_e]]
        if m is not None:
            m1 = re.search(ur'-+', m.group(1))
            if m1 is not None:
                tmp = [m.group(1)[:m1.start()], m.group(1)[m1.end():]]
            else:
                tmp = [m.group(1)]
            if len(tmp) > 1:
                entry[cm.tel] = cm.extract_tel(tmp[1])
            m1 = re.search(ur'\d{4,}', tmp[0])
            if m1 is not None:
                entry[cm.zip_code] = m1.group()
            addr_list.append(tmp[0].strip())
        entry[cm.addr_e] = ', '.join(addr_list)

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return store_list
示例#28
0
文件: omega.py 项目: haizi-zh/firenze
def fetch_store_details(data):
    url = '%s/%d' % (data['url'], data['store_id'])
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    m = re.search(ur'<h1 class="with-back-option">\s*([^<>]+)\s*[<>]', body)
    if m is not None:
        entry[cm.name_e] = m.group(1).strip()

    start = body.find(ur'<div class="store-details">')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
        addr = cm.extract_closure(sub, ur'<p\b', ur'</p>')[0]
        m = re.search(ur'<span class="locality">([^<>]+?)</span>', addr)
        if m is not None:
            entry[cm.city_e] = m.group(1).split(',')[0].strip().upper()
        m = re.search(ur'<span class="postal-code">([^<>]+?)</span>', addr)
        if m is not None:
            entry[cm.zip_code] = m.group(1).strip()
        m = re.search(ur'<span class="country-name">([^<>]+?)</span>', addr)
        if m is not None:
            entry[cm.country_e] = m.group(1).strip().upper()
        entry[cm.addr_e] = cm.reformat_addr(addr)

    start = body.find(ur'<div class="contact">')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
        m = re.search(ur'<span class="tel">(.+?)</span>', sub)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'<span class="fax">(.+?)</span>', sub)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
        m = re.search(ur'<a href="mailto:([^"]+)">Email</a>', sub)
        if m is not None:
            entry[cm.email] = m.group(1).strip()

    start = body.find(ur'<h3>Opening hours</h3>')
    if start != -1:
        tmp = []
        sub = cm.extract_closure(body[start:], ur'<table>', ur'</table>')[0]
        for m in re.findall(ur'<t[hd][^<>]*>([^<>]+)</t[hd]>', sub):
            tmp.append(m)
        entry[cm.hours] = ' '.join(tmp)
示例#29
0
def fetch_stores(data):
    """
    Get store information: record one store per <a href="…"> block of the reply.

    :param data: mapping read for "post_url", "city_id", "country_e", "city_e",
        "brand_id", "brandname_e" and "brandname_c".
    :return: list of the entries, or [] when cm.post_data raises.
    """
    url = data["post_url"]
    try:
        html = cm.post_data(url, {"pid": data["city_id"], "lang": "en", "action": "popola_box_DX"})
    except Exception:
        print "Error occured in getting city list: %s" % url
        dump_data = {"level": 2, "time": cm.format_time(), "data": {"url": url}, "brand_id": data["brand_id"]}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<a href="(.+?)".*?>', html):
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])
        entry[cm.url] = m.group(1)
        store_html, start, end = cm.extract_closure(html[m.start():], ur"<a href", ur"</a>")
        if end == 0:
            # Links for which extract_closure reports end == 0 are skipped.
            continue
        m1 = re.findall(ur'<h3 class="titleShop">(.+?)</h3>', store_html, re.S)
        if len(m1) > 0:
            entry[cm.name_e] = m1[0].strip()
        m1 = re.findall(ur"<p\b.*?>(.+?)(?:</p>|</div>)", store_html, re.S)
        if len(m1) > 0:
            terms = cm.reformat_addr(m1[0]).split(",")
            # When the last term yields a phone number, move it to the tel field.
            tel = cm.extract_tel(terms[-1])
            if tel != "":
                terms = terms[:-1]
                entry[cm.tel] = tel
            entry[cm.addr_e] = ", ".join([v.strip() for v in terms])

        entry["country_e"] = data["country_e"]
        entry["city_e"] = data["city_e"]
        gs.field_sense(entry)

        print "(%s / %d) Found store: %s, %s (%s, %s)" % (
            data["brandname_e"],
            data["brand_id"],
            entry[cm.name_e],
            entry[cm.addr_e],
            entry[cm.country_e],
            entry[cm.continent_e],
        )
        store_list.append(entry)
        db.insert_record(entry, "stores")

    # Previously the function fell off the end and returned None on success.
    return store_list
示例#30
0
文件: mido.py 项目: haizi-zh/firenze
def fetch_stores(data):
    """
    Parse the store list out of the HTML held in ``data['html']``.

    :param data: dict providing 'html', 'brand_id', 'brandname_e', 'brandname_c'
        and 'city_e'.
    """
    html = data['html']

    store_list = []
    while True:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        # Locate the next store item; the underlined text is used as the store name.
        m = re.search(ur'<li class="leaf end"><div><u>(.+?)</u>', html)
        if m is None:
            break
        html = html[m.start():]
        entry[cm.name_e] = m.group(1)

        sub, start, end = cm.extract_closure(html, ur'<li\b', '</li>')
        # Continue the next iteration after the item that was just extracted.
        html = html[end:]

        # Fragment of a single store: drop the matched header and the closing </li>.
        sub = sub[len(m.group(0)):-len('</li>')]

        m = re.search(ur'<a href="(http.+?)"', sub)
        if m is not None:
            entry[cm.url] = m.group(1)
        m = re.search(ur'<a href="mailto:(.+?)"', sub)
        if m is not None:
            entry[cm.email] = m.group(1)
        # The address text is everything before the first link or closing div.
        m = re.search(ur'(?:<a\b|</div>)', sub)
        if m is not None:
            addr = sub[:m.start()]
        else:
            addr = sub
            # Parse the address field
        addr = cm.reformat_addr(addr)
        terms = addr.split(',')
        new_terms = []
        for t in terms:
            if re.search(ur'phone', t, re.IGNORECASE) is not None:
                entry[cm.tel] = cm.extract_tel(t)
            elif re.search(ur'fax', t, re.IGNORECASE) is not None:
                entry[cm.fax] = cm.extract_tel(t)
            elif data['city_e'] in t.strip().upper():
                # Zip code: the first run of digits in the term that names the city
                m = re.search(ur'\d+', t)
                if m is not None:
                    entry[cm.zip_code] = m.group(0)
示例#31
0
            if entry[cm.lat] == 0 and entry[cm.lng] == 0:
                entry[cm.lat], entry[cm.lng] = '', ''

            item = pq(store)
            tmp = item('h1')
            entry[cm.name_e] = cm.html2plain(
                tmp[0].text).strip() if len(tmp) > 0 and tmp[0].text else ''

            tmp = item('dd.location')
            tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else ''
            entry[cm.city_e] = cm.extract_city(tmp)[0]

            tmp = item('dd.street')
            tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else ''
            entry[cm.addr_e] = cm.reformat_addr(tmp)

            tmp = item('dd.phone')
            tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else ''
            entry[cm.tel] = tmp.strip()

            tmp = item('dd.hours')
            tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else ''
            entry[cm.hours] = tmp.strip()

            tmp = item('dd.products')
            tmp = tmp[0].text if len(tmp) > 0 and tmp[0].text else ''
            entry[cm.store_type] = tmp.strip()

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e])
示例#32
0
def fetch(level=1, data=None, host='localhost', port=3306, user='******', passwd='123456'):
    tot = 0
    start = 0
    store_list = []
    data = {'q': '*:*', 'pt': '0,0', 'd': 100000, 'start': 0, 'rows': 100}
    # data = {'q': '*:*', 'pt': '36.778261,-119.417932', 'd': 50, 'start': 0, 'rows': 100}

    db = cm.StoresDb()
    db.connect_db(host=host, port=port, user=user, passwd=passwd, db='brand_stores')
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

    while True:
        cm.dump('Fetching from %d' % start, 'triumph_log.txt')
        try:
            data['start'] = start
            html = cm.get_data(url, data)
            raw_list = json.loads(html)
            if tot == 0:
                tot = raw_list['response']['numFound']
                cm.dump('Found: %d' % tot, 'triumph_log.txt')
            raw_list = raw_list['response']['docs']
        except Exception:
            cm.dump('Error occured while fetching from %d' % data['start'], 'triumph_log.txt')
            dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
            cm.dump(dump_data)
            return []

        idx = 0
        if len(raw_list) < data['rows'] and start + len(raw_list) < tot:
            cm.dump('Cooling down...', 'triumph_log.txt')
            time.sleep(5)
            continue

        for v in raw_list:
            entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
            cm.update_entry(entry, {cm.store_type: v['class'],
                                    cm.zip_code: v['zip'], cm.tel: v['phone'], cm.fax: v['fax'],
                                    cm.url: v['web'], cm.email: v['email'], cm.hours: v['opening_hours']})
            entry[cm.name_e] = cm.reformat_addr(v['name'])

            entry[cm.city_e], tmp = cm.extract_city(v['city'])
            if not re.search(ur'\d', entry[cm.zip_code]) and tmp != '':
                entry[cm.zip_code] = tmp

            if v['location'] != '':
                terms = v['location'].split(',')
                cm.update_entry(entry, {cm.lat: string.atof(terms[0]), cm.lng: string.atof(terms[1])})
            addr = v['address']
            if v['address2'] != '':
                addr += ', ' + v['address2']
            entry[cm.addr_e] = cm.reformat_addr(addr)
            ret = gs.look_up(v['country'], 1)
            if ret is not None:
                entry[cm.country_e] = ret['name_e']
            else:
                cm.dump('Error in looking up country %s' % v['country'], 'triumph_log.txt')
            gs.field_sense(entry)

            cm.dump('(%s / %d) Found store at %d: %s, %s (%s, %s, %s)' % (
                brandname_e, brand_id, start + idx, entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e],
                entry[cm.country_e],
                entry[cm.continent_e]), 'triumph_log.txt')
            store_list.append(entry)
            db.insert_record(entry, 'stores')
            idx += 1

        if tot - start <= len(raw_list):
            break
        else:
            start += len(raw_list)
示例#33
0
def fetch_stores(data):
    url = data['store_url']
    param = {
        'store_country': data['country_code'],
        'store_city': data['city_code']
    }
    try:
        body = cm.post_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []

    store_list = []
    for s in re.findall(ur'<marker\b([^<>]+)/\s*>', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        m = re.search(ur'store_name="([^"]+)"', s)
        if m is not None:
            entry[cm.name_e] = cm.reformat_addr(m.group(1))
        entry[cm.country_e] = data['country_code']
        entry[cm.city_e] = data['city']
        addr_list = []
        for key in ['store_mall_name', 'store_address', 'store_zip_code']:
            m = re.search(ur'%s="([^"]+)"' % key, s)
            if m is not None:
                tmp = cm.reformat_addr(m.group(1))
                if tmp != '':
                    addr_list.append(tmp)
        entry[cm.addr_e] = ', '.join(addr_list)
        m = re.search(ur'store_zip_code="([^"]+)"', s)
        if m is not None:
            entry[cm.zip_code] = m.group(1).strip()
        m = re.search(ur'store_telephone="([^"]+)"', s)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'store_fax="([^"]+)"', s)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
        m = re.search(ur'store_email="([^"]+)"', s)
        if m is not None:
            entry[cm.email] = m.group(1).strip()
        m = re.search(ur'store_latitude="([^"]+)"', s)
        if m is not None:
            entry[cm.lat] = string.atof(m.group(1).strip())
        m = re.search(ur'store_longitude="([^"]+)"', s)
        if m is not None:
            entry[cm.lng] = string.atof(m.group(1).strip())

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        gs.field_sense(entry)

        cm.dump(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
            log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
示例#34
0
def fetch_store_details(data):
    """
    Fetch the single store described by the store-locator block of a page.

    :param data: dict providing 'url', 'country_code', 'city', 'brand_id',
        'brandname_e' and 'brandname_c'.
    :return: a one-element list holding the store entry; an empty list when the
        request fails or the page has no store-locator block.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {
            'cCode': data['country_code'],
            'city': data['city'],
            'postsearch': 1
        })
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    start = body.find('<div class="store_locator')
    if start == -1:
        print 'Failed processing %s' % url
        return []
    sub, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    # Without an Address paragraph the whole block is searched for the title;
    # previously m.end() was called on None in that case.
    addr_text = sub
    m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S)
    if m is not None:
        addr_list = cm.reformat_addr(m.group(1)).split(', ')
        # The last term is taken as the phone number when extract_tel recognises one.
        ret = cm.extract_tel(addr_list[-1])
        if ret != '':
            entry[cm.tel] = ret
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)
        addr_text = sub[m.end():]

    m = re.search(ur'<div class="title locator">', addr_text)
    if m is not None:
        tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b',
                                 ur'</div>')[0]
        m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S)
        if m1 is not None:
            # NOTE(review): this overwrites the address parsed above with the
            # title text; confirm whether the store name field was intended.
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

    m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)',
                  body, re.S)
    if m is not None:
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))

    entry[cm.country_e] = data['country_code']
    entry[cm.city_e] = data['city']
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        data['brandname_e'], data['brand_id'], entry[cm.name_e],
        entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
示例#35
0
def fetch_stores(data):
    """
    Fetch the JSON store list at ``data['url']`` and insert every store into the database.

    :param data: dict providing 'url', 'brand_id', 'brandname_e' and 'brandname_c'.
    :return: list of the store entries found; an empty list when the request fails.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    raw = json.loads(body)
    store_list = []
    for s in raw:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = s['storename']
        entry[cm.addr_e] = cm.reformat_addr(', '.join([s['building'].replace(u'operated by ', u''),
                                                       s['street'].strip()]))

        # A record may carry a city without a country; normalise the country
        # once so the US check below cannot call strip() on None.
        country = s['country'].strip() if s['country'] is not None else u''
        if s['country'] is not None:
            entry[cm.country_e] = country.upper()
        if s['city'] is not None:
            if country == u'US':
                # US cities are split on ',' into city and province.
                tmp = s['city'].split(',')
                entry[cm.city_e] = tmp[0].strip().upper()
                if len(tmp) > 1:
                    entry[cm.province_e] = tmp[1].strip().upper()
            else:
                entry[cm.city_e] = s['city'].strip().upper()
            entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        if s['zip'] is not None:
            entry[cm.zip_code] = s['zip'].strip()
        if s['phone'] is not None:
            entry[cm.tel] = s['phone'].strip()
        if s['storeemail'] is not None:
            entry[cm.email] = s['storeemail'].strip()
        # A 'storelink' containing '@' is not stored as the URL.
        if s['storelink'] is not None and u'@' not in s['storelink']:
            entry[cm.url] = s['storelink'].strip()
        if s['storetype'] is not None:
            entry[cm.store_class] = s['storetype'].strip()
        hours = []
        for item in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']:
            if s[item] is not None:
                hours.append('%s: %s' % (item, s[item]))
        entry[cm.hours] = ', '.join(hours)
        # The store type lists every product line flagged with '1'.
        styles = []
        for item in ['menswear', 'womenswear', 'kidswear']:
            if s[item] == '1':
                styles.append(item)
        entry[cm.store_type] = ', '.join(styles)
        if s['latitude'] is not None and s['latitude'].strip() != '':
            entry[cm.lat] = string.atof(s['latitude'])
        if s['longitude'] is not None and s['longitude'].strip() != '':
            entry[cm.lng] = string.atof(s['longitude'])

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        # field_sense is re-run only when the province was filled from the address.
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
            gs.field_sense(entry)

        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
    return store_list
示例#36
0
        # cm.dump('Error in fetching stores: %s' % url, log_name)
        return ()

    m = re.search(ur'<div class="col">\s*<h3>Boutique</h3>\s*<div class="content">(.+?)</div>', body, re.S)
    if not m:
        return ()
    sub = m.group(1)

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    md5 = hashlib.md5()
    md5.update(url)
    entry[cm.native_id] = md5.hexdigest()

    entry[cm.country_e] = data['country_code']
    if entry[cm.country_e] == 'US':
        tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(data['city']).strip(','))
        if len(tmp_list) == 2:
            if re.search('[A-Z]{2}', tmp_list[1]) or tmp_list[1] == 'D.C.':
                entry[cm.province_e] = tmp_list[1]
    entry[cm.city_e] = cm.extract_city(data['city'])[0]
    entry[cm.province_e] = data['state'] if data['state_code'] else ''

    sub_list = re.findall(ur'<p>(.+?)</p>', m.group(1), re.S)
    if len(sub_list) < 2:
        return ()
    title_list = tuple(tmp.strip() for tmp in cm.reformat_addr(sub_list[0]).split(','))
    entry[cm.name_e] = title_list[0]
    if len(title_list) > 1:
        entry[cm.store_class] = title_list[1]

    entry[cm.addr_e] = cm.reformat_addr(sub_list[1])
示例#37
0
    store_list = []
    for store in tree.iter('poi'):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        val = store.getiterator('uid')[0].text
        if val in store_map:
            continue
        store_map[val] = entry

        val = store.getiterator('name')[0].text
        entry[cm.name_e] = cm.html2plain(val).strip() if val else ''

        addr_list = []
        for idx in xrange(1, 3):
            val = store.getiterator('address%d' % idx)[0].text
            if val:
                val = cm.reformat_addr(val)
                if val != '':
                    addr_list.append(val)
        entry[cm.addr_e] = ', '.join(addr_list)

        val = store.getiterator('city')[0].text
        entry[cm.city_e] = cm.extract_city(val)[0] if val else ''
        val = store.getiterator('province')[0].text
        entry[cm.province_e] = cm.html2plain(val).strip().upper() if val else ''
        if entry[cm.province_e] == '':
            val = store.getiterator('state')[0].text
            entry[cm.province_e] = cm.html2plain(val).strip().upper() if val else ''
        val = store.getiterator('country')[0].text
        entry[cm.country_e] = val.strip().upper() if val else ''

        val = store.getiterator('email')[0].text
示例#38
0
文件: furla.py 项目: haizi-zh/firenze
def fetch_stores(data):
    """
    Fetch the stores of one city from its store page and insert them into the database.

    :param data: dict providing 'store_url', 'city_id', 'brand_id', 'brandname_e',
        'brandname_c' and the country, continent and city fields keyed by
        ``cm.country_e``, ``cm.continent_e`` and ``cm.city_e``.
    :return: list of the store entries found; an empty list when the request fails.
    """
    url = '%s%d/' % (data['store_url'], data['city_id'])
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # Matches a leading "tel" label so that term can be routed to the phone field.
    tel_pat = re.compile(ur'^tel[\.: ]+', re.I)

    store_list = []
    for m in re.finditer(ur'<div class="store">', html):
        store_sub, ss, se = cm.extract_closure(html[m.start():], ur'<div\b',
                                               ur'</div')
        # The end offset is checked here, as in the two checks below; the
        # original compared the builtin ``set`` and so never skipped anything.
        if se == 0:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])

        start = store_sub.find('<div class="store_name">')
        if start == -1:
            continue
        sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b',
                                             ur'</div>')
        if end == 0:
            continue
        m1 = re.search(ur'<p>(.+?)</p>', sub)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1).strip()

        start = store_sub.find('<div class="store_address">')
        if start == -1:
            continue
        sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b',
                                             ur'</div>')
        if end == 0:
            continue
        m1 = re.search(ur'<p>(.+?)</p>', sub, re.S)
        if m1 is not None:
            addr_list = cm.reformat_addr(m1.group(1)).split(',')
            tmp = []
            for term in addr_list:
                # Terms starting with a "tel" label carry the phone number;
                # every other term stays part of the address.
                if re.search(tel_pat, term.strip()) is not None:
                    term = re.sub(tel_pat, '', term.strip())
                    entry[cm.tel] = cm.extract_tel(term)
                else:
                    tmp.append(term.strip())
            entry[cm.addr_e] = ', '.join(tmp)

        m1 = re.search(ur'<input\s.+?name="latitude"\s+value="(.+?)"\s*/>',
                       store_sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
        m1 = re.search(ur'<input\s.+?name="longitude"\s+value="(.+?)"\s*/>',
                       store_sub)
        if m1 is not None:
            entry[cm.lng] = string.atof(m1.group(1))

        entry[cm.country_e] = data[cm.country_e]
        entry[cm.continent_e] = data[cm.continent_e]
        entry[cm.city_e] = data[cm.city_e]
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
    # The error path returns a list, so the success path returns the collected list too.
    return store_list
示例#39
0
def get_store_details(data):
    """
    Request the details of one store and insert the resulting entry into the database.

    ``brand_id``, ``brandname_e`` and ``brandname_c`` are read from module scope.

    :param data: dict providing 'url', 'country_id', 'city_id', 'store_id',
        'country' and 'city'.
    :return: the store entry; an empty list when the request fails.
    """
    url = data['url']
    try:
        payload = {
            'country': data['country_id'],
            'city': data['city_id'],
            'recordid': data['store_id']
        }
        html = cm.post_data(url, payload)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        })
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    info = json.loads(html)['elements']

    # Drop backslashes and turn paragraph tags into commas before reformatting.
    raw_addr = info['address'].replace('\\', '')
    for tag in ('<p>', '</p>'):
        raw_addr = raw_addr.replace(tag, ',')
    addr = cm.reformat_addr(raw_addr)

    # The first comma-separated term of the address is the store name.
    pieces = addr.split(',')
    if pieces:
        entry[cm.name_e] = cm.reformat_addr(pieces[0])
    entry[cm.addr_e] = addr

    # The first "lat,lng" decimal pair in the map URL gives the coordinates.
    coords = re.search(ur'(-?\d+\.\d+),(-?\d+\.\d+)', info['gmap'])
    if coords is not None:
        lat_text, lng_text = coords.groups()
        cm.update_entry(entry, {
            cm.lat: string.atof(lat_text),
            cm.lng: string.atof(lng_text)
        })

    entry[cm.url] = info['shareurl'].replace('\\', '')
    entry[cm.hours] = info['openingtimes']
    entry[cm.comments] = info['other']

    # Geographic fields come from the caller.
    cm.update_entry(entry, {
        cm.country_e: data['country'],
        cm.city_e: data['city']
    })
    entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    # Only fill province and city from the address lookup when still empty.
    if entry[cm.province_e] == '' and sensed[1] is not None:
        entry[cm.province_e] = sensed[1]
    if entry[cm.city_e] == '' and sensed[2] is not None:
        entry[cm.city_e] = sensed[2]
    gs.field_sense(entry)

    summary = (brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e],
               entry[cm.country_e], entry[cm.continent_e])
    print '(%s / %d) Found store: %s, %s (%s, %s)' % summary

    db.insert_record(entry, 'stores')
    return entry
示例#40
0
文件: tod.py 项目: haizi-zh/firenze
            entry[cm.name_e] = cm.html2plain(m.group(1)).strip()

        m = re.search(ur'<brands>(.+?)</brands>', s)
        if m is not None:
            brand_list = []
            for m1 in re.findall(ur'<brand>(.+?)</brand>', m.group(1)):
                brand_list.append(m1)
            entry[cm.store_type] = ', '.join(brand_list)

        m = re.search(ur'<city>(.+?)</city>', s)
        if m is not None:
            entry[cm.city_e] = cm.html2plain(m.group(1)).strip().upper()

        m = re.search(ur'<address>(.+?)</address>', s)
        if m is not None:
            entry[cm.addr_e] = cm.reformat_addr(m.group(1)).strip()

        m = re.search(ur'<phone>(.+?)</phone>', s)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()

        m = re.search(
            ur'<(?:latitude|latitiude)>(.+?)</(?:latitude|latitiude)>', s)
        if m is not None:
            entry[cm.lat] = string.atof(m.group(1))
        m = re.search(ur'<longitude>(.+?)</longitude>', s)
        if m is not None:
            entry[cm.lng] = string.atof(m.group(1))

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
示例#41
0
    try:
        body = cm.get_data(url)
    except Exception, e:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return ()

    m = re.search(ur'<div class="contact-info">(.+?)</div>', body, re.S)
    if m is None:
        return s

    entry = s.copy()
    pat_tel = re.compile(ur'\s*Numéro de téléphone\s*[:\.]\s*')
    pat_fax = re.compile(ur'\s*Numéro de fax\s*[:\.]\s*')
    pat_email = re.compile(ur'\s*Adresse électronique\s*[:\.]\s*')
    for term in [
            tmp.strip() for tmp in cm.reformat_addr(m.group(1)).split(',')
    ]:
        if re.search(pat_tel, term):
            entry[cm.tel] = re.sub(pat_tel, '', term).strip()
        if re.search(pat_fax, term):
            entry[cm.fax] = re.sub(pat_fax, '', term).strip()
        if re.search(pat_email, term):
            entry[cm.email] = re.sub(pat_email, '', term).strip()
    return entry


def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception, e:
示例#42
0
def fetch_stores(data):
    url = data['data_url']
    logger = logging.getLogger('firenzeLogger')
    param = {'storeId': 10551, 'catalogId': 10051, 'countryTab': 'in', 'countryCode': data['country_code']}
    if data['country_code'] == 'US':
        param['radius'] = 20
        param['state'] = data['state_code']
    try:
        body = cm.get_data(url, param)['body']
    except Exception as e:
        logger.error('Error in fetching stores: %s, %s' % (url, param))
        return ()

    store_list = []
    for item in (pq(tmp) for tmp in pq(body)('div.vcard')):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country_code']
        tmp = item('div.resultsHeader a b')
        if len(tmp) > 0 and tmp[0].text:
            entry[cm.name_e] = cm.html2plain(tmp[0].text)
        tmp = item('div.adr')
        if len(tmp) > 0:
            tmp = pq(tmp[0])
            entry[cm.addr_e] = cm.reformat_addr(unicode(tmp))
            tmp1 = tmp('.locality')
            if len(tmp1) > 0 and tmp1[0].text:
                entry[cm.city_e] = cm.extract_city(tmp1[0].text)[0]
            tmp1 = tmp('.region')
            if len(tmp1) > 0 and tmp1[0].text:
                entry[cm.province_e] = cm.html2plain(tmp1[0].text).strip().upper()
            tmp1 = tmp('.postal-code')
            if len(tmp1) > 0 and tmp1[0].text:
                entry[cm.zip_code] = tmp1[0].text

        tmp = item('div.tel')
        if len(tmp) > 0:
            entry[cm.tel] = tmp[0].text if tmp[0].text else ''

        tmp = item('div.store_hours')
        if len(tmp) > 0:
            entry[cm.hours] = cm.reformat_addr(unicode(pq(tmp[0])))

        tmp = item('#map')
        if len(tmp) > 0:
            m = re.search(ur'Lat=(-?\d+\.\d+)', unicode(pq(tmp[0])))
            if m:
                entry[cm.lat] = string.atof(m.group(1))
            m = re.search(ur'Lng=(-?\d+\.\d+)', unicode(pq(tmp[0])))
            if m:
                entry[cm.lng] = string.atof(m.group(1))

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        # cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
        #                                                     entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
        #
        #                                                     entry[cm.continent_e]), log_name)
        # cm.insert_record(data['database'], entry, 'stores')
        store_list.append(entry)

    return tuple(store_list)
示例#43
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<div\s+class\s*=\s*"storeItem"', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

        sub = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<div class="bubbleInfo">(.+?)</div>', sub)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'lat="(-?\d+\.\d+)"', sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
        m1 = re.search(ur'lng="(-?\d+\.\d+)"', sub)
        if m1 is not None:
            entry[cm.lng] = string.atof(m1.group(1))
        m1 = re.search(ur'<span>\s*Tel:\s*([^<>]+)</span>', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).strip()
        m1 = re.search(ur'http://maps\.google\.com/maps\?q=([^&"]+)', sub)
        if m1 is None:
            continue
        ret = gs.geocode(latlng=m1.group(1))
        if ret is None:
            tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')]
            if 'MAX' in tmp[0]:
                del tmp[0]
            if cm.extract_tel(tmp[-1])!='':
                del tmp[-1]
            if len(tmp) > 0:
                ret = gs.geocode(', '.join(tmp))
        if ret is not None:
            city = ''
            province = ''
            country = ''
            zip_code = ''
            tmp = ret[0]['address_components']
            for v in tmp:
                if 'locality' in v['types']:
                    city = v['long_name'].strip().upper()
                elif 'administrative_area_level_1' in v['types']:
                    province = v['long_name'].strip().upper()
                elif 'country' in v['types']:
                    country = v['long_name'].strip().upper()
                elif 'postal_code' in v['types']:
                    zip_code = v['long_name'].strip()
            entry[cm.country_e] = country
            entry[cm.province_e] = province
            entry[cm.city_e] = city
            entry[cm.zip_code] = zip_code
            gs.field_sense(entry)
            cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)
        else:
            cm.dump('Error in fetching stores: latlng=%s, addr=%s' % (m1.group(1), entry[cm.addr_e]), log_name)
            continue
示例#44
0
    try:
        details = pq(pq(body)('.store-details')[0])

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.province_e] = data['state'] if data['state'] else ''
        entry[cm.url] = url
        entry[cm.name_e] = data['store_name']
        entry[cm.city_e] = data['city'] if data['city'] else ''

        if data['addr']:
            entry[cm.addr_e] = data['addr']
        else:
            entry[cm.addr_e] = cm.reformat_addr(unicode(pq(details('p')[0])))

        if data['tel']:
            entry[cm.tel] = data['tel']
        else:
            tmp = details('p')[1].text
            pat = re.compile(ur'(phone|tel|telephone)\s*[\.: ]?\s*', re.I)
            if re.search(pat, tmp):
                entry[cm.tel] = re.sub(pat, '', tmp).strip()

        sub = unicode(details)
        start = sub.find(u'Regular Store Hours')
        if start != -1:
            m = re.search(ur'<p>(.+?)<ul', sub[start:], re.S)
            if m:
                entry[cm.hours] = cm.reformat_addr(m.group(1))
示例#45
0
            entry[cm.tel] = s['phone'] if s['phone'] else ''
            entry[cm.url] = (
                data['host'] +
                s['storeDetailUrl']) if s['storeDetailUrl'] else ''

            hour_list = []
            try:
                body = cm.get_data(entry[cm.url], client='iPad')
                html = pq(body)
                for sub in (pq(tmp) for tmp in html(
                        'table.storeDetailed-horaires-content tr')):
                    tmp = sub('td[class!="hours"]')
                    if len(tmp) == 0:
                        continue
                    val1 = cm.reformat_addr(tmp[0].text).strip()
                    tmp = sub('td.hours')
                    if len(tmp) == 0:
                        continue
                    val2 = cm.reformat_addr(tmp[0].text).strip()
                    if val1 == '' or val2 == '':
                        continue
                    hour_list.append('%s %s' % (val1, val2))

                tmp = html('div.storeDetailed-horaires-content')
                if len(tmp) > 0:
                    hour_list.append('Closing days: ' +
                                     cm.reformat_addr(tmp[0].text).strip())
                entry[cm.hours] = ', '.join(hour_list)
            except Exception as e:
                print traceback.format_exc()
示例#46
0
def fetch(level=1, data=None, user='******', passwd=''):
    """Re-crawl all stores of the brand and write them to the ``stores`` table.

    Existing rows for ``brand_id`` are deleted first, then the store feed is
    requested page by page (following the feed's ``more`` marker) and every
    store found is inserted.

    The database connection is always closed, including on the early error
    return and when a follow-up request raises.

    :param level: unused; kept for interface compatibility.
    :param data: unused; the request parameters are built internally.
    :param user: database user name.
    :param passwd: database password.
    :rtype: list of the store entries inserted; [] when the first request
        fails.
    """
    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    try:
        return _fetch_all_stores(db)
    finally:
        db.disconnect_db()


def _fetch_all_stores(db):
    """Run the crawl against an already connected ``db``; see ``fetch``."""
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

    # Bounding box and filter flags sent with every request.
    params = {
        's': -89,
        'w': -179,
        'n': 89,
        'e': 179,
        'chinese': 0,
        'repair': 1,
        'store': 1
    }
    try:
        html = common.get_data(url_init, params)
    except Exception:
        print 'Error occured in getting the list of countries: %s' % url_init
        dump_data = {
            'level': 1,
            'time': common.format_time(),
            'data': {
                'data': url_init
            },
            'brand_id': brand_id
        }
        common.dump(dump_data)
        return []

    store_list = []
    store_map = json.loads(html)
    tot = 0
    while True:
        page = store_map['lists']
        # True when the feed announces a further page via a 'more' item.
        has_more = False
        if isinstance(page, dict):
            # Already a {uid: record} mapping; a 'more' record, if present,
            # is picked up in the loop below.
            raw_stores = page
        else:
            # A plain list: index the store records by 'nid' and pull the
            # 'more' marker out.
            raw_stores = {}
            for item in page:
                if 'more' in item:
                    has_more = item['more']
                else:
                    raw_stores[item['nid']] = item

        for key in raw_stores:
            s = raw_stores[key]
            if 'more' in s:
                has_more = s['more']
                continue

            entry = _build_store_entry(s)
            print '%s: Found store: %s, %s (%s, %s)' % (
                brandname_e, entry[common.name_e], entry[common.addr_e],
                entry[common.country_e], entry[common.continent_e])
            db.insert_record(entry, 'stores')
            store_list.append(entry)

        if not has_more:
            tot += len(page)
            break

        # One item of the page is the 'more' marker, not a store.
        tot += len(page) - 1
        params['offset'] = tot
        store_map = json.loads(common.get_data(url_more, params))

    print 'Found a total of %d stores.' % tot
    return store_list


def _build_store_entry(s):
    """Convert one raw feed record ``s`` into a store entry dict."""
    entry = common.init_store_entry(brand_id, brandname_e, brandname_c)

    if s['country'] is not None:
        country = s['country'].strip().upper()
        ret = gs.look_up(country, 1)
        if ret is not None:
            entry[common.country_e] = ret['name_e']
            entry[common.country_c] = ret['name_c']
        elif common.is_cjk(country):
            entry[common.country_c] = country
        else:
            entry[common.country_e] = country

    if s['address'] is not None:
        addr = common.reformat_addr(s['address'])
        if common.is_cjk(addr):
            entry[common.addr_c] = addr
        else:
            entry[common.addr_e] = addr

    city = s['city']
    if city is not None:
        city = city.strip().upper()
        ret = gs.look_up(city, 3)
        if ret is not None:
            entry[common.city_c] = ret['name_c']
            entry[common.city_e] = ret['name_e']
        elif common.is_cjk(city):
            entry[common.city_c] = city
        else:
            entry[common.city_e] = city

    entry[common.city_e] = common.extract_city(entry[common.city_e])[0]

    if s['email'] is not None:
        entry[common.email] = s['email']
    if s['fax'] is not None:
        entry[common.fax] = s['fax']
    if s['latitude'] is not None:
        entry[common.lat] = float(s['latitude'])
    if s['longitude'] is not None:
        entry[common.lng] = float(s['longitude'])
    if s['phone'] is not None:
        entry[common.tel] = s['phone']
    if s['postal_code'] is not None:
        entry[common.zip_code] = s['postal_code']

    if s['title'] is not None:
        name = s['title']
        if common.is_cjk(name):
            entry[common.name_c] = name
        else:
            entry[common.name_e] = name

    if s['operating_hours'] is not None:
        entry[common.hours] = s['operating_hours']
    if s['url'] is not None:
        entry[common.url] = host + s['url']

    gs.field_sense(entry)
    return entry
示例#47
0
    # Unwrap the JSONP response: remove the callback prefix and drop the final
    # character (presumably the closing parenthesis -- confirm against the
    # feed) so that the remainder parses as JSON.
    body = re.sub(ur'GetLocalLevisCallback\(', '', body)[:-1]
    for s in json.loads(body)['d']['results']:
        try:
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                        data['brandname_c'])

            # The record's metadata URI is used as the store's native id;
            # records already present in store_map are logged and skipped.
            uid = s['__metadata']['uri']
            if uid in store_map:
                cm.dump(u'%s already exists.' % uid, log_name)
                continue

            entry[cm.country_e] = cm.html2plain(
                s['CountryRegion']).strip().upper()
            entry[cm.native_id] = uid
            entry[cm.city_e] = cm.extract_city(s['Locality'])[0]
            entry[cm.addr_e] = cm.reformat_addr(s['AddressLine'])

            entry[cm.zip_code] = s['PostalCode']
            entry[cm.tel] = s['Phone']
            entry[cm.name_e] = cm.html2plain(
                s['BranchName']).strip() if s['BranchName'] else ''

            # Coordinates: '' is stored when the feed value is the empty
            # string; conversion problems are logged rather than raised.
            try:
                entry[cm.lat] = string.atof(
                    s['Latitude']) if s['Latitude'] != '' else ''
            except (ValueError, KeyError, TypeError) as e:
                cm.dump('Error in fetching lat: %s' % str(e), log_name)
            try:
                entry[cm.lng] = string.atof(
                    s['Longitude']) if s['Longitude'] != '' else ''
            except (ValueError, KeyError, TypeError) as e:
示例#48
0
def fetch_stores(data):
    url = data['host'] + 'after-sales-services/boutique-finder'
    param = {
        'productOffer': 'All',
        'city': data['city_id'],
        'boutiqueType': 'All',
        'country': data['country_id']
    }
    if data['state'] is not None:
        param['prefecture'] = data['state']['state_id']

    page = 0
    totStore = -1
    store_list = []
    while True:
        if totStore != -1 and len(store_list) >= totStore:
            break
        else:
            page += 1

        param['numPageToGet'] = page
        try:
            body = cm.get_data(url, param)
        except Exception, e:
            # cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
            break

        m = re.search(ur'<list id="WS_boutique_list" nbBoutique="(\d+)">',
                      body)
        if m is None:
            # cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
            break
        totStore = string.atoi(m.group(1))

        sub = cm.extract_closure(body[m.start():], ur'<list\b', ur'</list>')[0]
        for m in re.finditer(ur'<list id="WS_boutique_\d+">', sub):
            store_sub = cm.extract_closure(sub[m.start():], ur'<list\b',
                                           ur'</list>')[0]
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                        data['brandname_c'])
            entry[cm.city_e] = cm.extract_city(data['city_name'])[0]
            entry[cm.country_e] = data['country_name']
            if data['state'] is not None:
                entry[cm.province_e] = data['state']['state_name']

            m1 = re.search(ur'productOffers="([^"]+)"', store_sub)
            if m1 is not None:
                entry[cm.store_type] = m1.group(1).strip()
            m1 = re.search(ur'boutiqueType="([^"]+)">', store_sub)
            if m1 is not None:
                entry[cm.store_class] = m1.group(1).strip()
            m1 = re.search(ur"<p class='boutique_title'>(.+?)</p>", store_sub)
            if m1 is not None:
                entry[cm.name_e] = m1.group(1).strip()
            m1 = re.search(
                ur'<object type="text" id="WS_boutique_detail[^"]+">(.+?)</object>',
                store_sub, re.S)
            if m1 is not None:
                m2 = re.search(ur'<p [^<>]*>(.+?)</p>', m1.group(1), re.S)
                if m2 is not None:
                    addr_list = []
                    for term in (tmp.strip() for tmp in cm.reformat_addr(
                            m2.group(1)).split(',')):
                        pat_tel = re.compile(ur'phone:\s*', re.I)
                        pat_fax = re.compile(ur'fax:\s*', re.I)
                        pat_email = re.compile(
                            r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'
                        )
                        if re.search(pat_tel, term) is not None:
                            entry[cm.tel] = re.sub(pat_tel, '', term).strip()
                        elif re.search(pat_fax, term) is not None:
                            entry[cm.fax] = re.sub(pat_fax, '', term).strip()
                        elif re.search(pat_email, term) is not None:
                            entry[cm.email] = re.search(pat_email,
                                                        term).group()
                        else:
                            addr_list.append(term)
                    entry[cm.addr_e] = ', '.join(addr_list)

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)
            logger.info(
                '(%s / %d) Found store: %s, %s (%s, %s)' %
                (data['brandname_e'], data['brand_id'], entry[cm.name_e],
                 entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]))
            # db.insert_record(entry, 'stores')
            store_list.append(entry)
示例#49
0
文件: ca.py 项目: haizi-zh/ofashion
                                                  entry[cm.zip_code])
            except (IndexError, TypeError):
                pass
            # Phone and fax are optional: a missing span raises IndexError,
            # which leaves the field untouched; an empty span stores ''.
            try:
                tmp = store('td.address span.tel')[0].text
                entry[cm.tel] = tmp if tmp else ''
            except IndexError:
                pass
            try:
                tmp = store('td.address span.fax')[0].text
                entry[cm.fax] = tmp if tmp else ''
            except IndexError:
                pass

            # One item per row of the opening-times table; rows containing
            # the "opening times" caption are dropped.
            hours_list = []
            for item in (cm.reformat_addr(unicode(pq(tmp)))
                         for tmp in store('td.opening table tr')):
                if 'opening times' in item.lower():
                    continue
                # Collapse "<label>: , <value>" into "<label>: <value>".
                hours_list.append(re.sub(ur':\s*,\s*', ': ', item))
            entry[cm.hours] = ', '.join(hours_list)

            gs.field_sense(entry)
            # Province and city inferred from the address only fill fields
            # that are still empty.
            if entry[cm.addr_e]:
                ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)
示例#50
0
    # Locate the store block; without it the page is logged as an error and
    # an empty tuple is returned.
    start = body.find(ur'<div class="storeLocation">')
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return ()
    sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.country_e] = data['country']
    entry[cm.url] = url

    m = re.search(ur'<h2 class="title"\s*>([^<>]+)</h2>', sub)
    entry[cm.name_e] = cm.html2plain(m.group(1)).strip() if m else ''

    m = re.search(ur'<address class="address"\s*>(.+?)</address>', sub, re.S)
    entry[cm.addr_e] = cm.reformat_addr(m.group(1)) if m else ''

    # The contact paragraph is searched for after the <address> element.
    # NOTE(review): m.end() raises AttributeError when no <address> matched
    # (the line above tolerates that case), and m.group(1) in the loop below
    # raises likewise when no <p> is found.
    m = re.search(ur'<p>(.+)</p>', sub[m.end():], re.S)
    contact_list = []
    # Labels such as "Phone:", "Phone 2." / "Fax:" / "Email:"; the matched
    # label is stripped and the remainder stored.
    pat_tel = re.compile(ur'phone[\s\d]*[:\.]\s*', re.I)
    pat_fax = re.compile(ur'fax[\s\d]*[:\.]\s*', re.I)
    pat_email = re.compile(ur'email[\s\d]*[:\.]\s*', re.I)
    for term in (tmp.strip()
                 for tmp in cm.reformat_addr(m.group(1)).split(',')):
        if re.search(pat_tel, term):
            entry[cm.tel] = re.sub(pat_tel, '', term).strip()
        elif re.search(pat_fax, term):
            entry[cm.fax] = re.sub(pat_fax, '', term).strip()
        elif re.search(pat_email, term):
            entry[cm.email] = re.sub(pat_email, '', term).strip()
示例#51
0
    store_list = []
    # Each store is the inner HTML of a <div class="store_wrapper"> element
    # (non-greedy up to the first closing </div>).
    for s in re.findall(ur'<div class="store_wrapper">(.+?)</div>', body,
                        re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.country_e] = data['country']

        m = re.search(ur'<h2>(.+?)</h2>', s)
        if m is not None:
            entry[cm.name_e] = cm.html2plain(m.group(1))

        m = re.search(ur'<p>(.+?)</p>', s, re.S)
        if m is not None:
            addr_list = [
                tmp.strip() for tmp in cm.reformat_addr(m.group(1)).split(',')
            ]
            # The last comma-separated item may be a phone number, optionally
            # prefixed with "T." or "T:"; the prefix is removed before
            # cm.extract_tel is applied.
            tel = cm.extract_tel(
                re.sub(re.compile('^\s*t\s*(\.|:)\s*', re.I), '',
                       addr_list[-1]))
            if tel != '':
                if entry[cm.country_e] == 'CHINA':
                    # For China the item is accepted as a phone number only
                    # when it contains more than six digits; otherwise it
                    # stays in the address.
                    if len(re.findall(r'\d', tel)) > 6:
                        entry[cm.tel] = tel
                        del addr_list[-1]
                else:
                    entry[cm.tel] = tel
                    del addr_list[-1]
            entry[cm.addr_e] = ', '.join(addr_list)

        gs.field_sense(entry)
示例#52
0
文件: tudor.py 项目: haizi-zh/firenze
def fetch_stores(data):
    url = data['data_url']
    param = {'lang': data['lang'], 'country': data['country_id'], 'region': data['region_id'],
             'city': data['city_id']}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), 'tudor_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for store in (pq(tmp) for tmp in pq(body.encode('utf-8'))('dealer')):
        try:
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.country_e] = data['country_code']
            entry[cm.province_e] = data['region_name'].replace('PROVINCE', '').strip()
            entry[cm.city_e] = data['city_name']

            store_id = store[0].attrib['id']
            if store_id in id_set:
                if data['country_code'] == 'CN':
                    entry = id_set[store_id]

                    entry[cm.name_c] = cm.reformat_addr(store('name')[0].text).strip()
                    tmp = store('address')
                    entry[cm.addr_c] = cm.reformat_addr(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                    entry[cm.province_c] = data['region_name']
                    entry[cm.city_c] = data['city_name']

                    db.execute(u'DELETE FROM stores WHERE brand_id=%d AND native_id="%s"' % (
                        data['brand_id'], entry[cm.native_id]))
                    db.insert_record(entry, 'stores')
            else:
                entry[cm.native_id] = store_id

                entry[cm.name_e] = cm.reformat_addr(store('name')[0].text).strip()
                tmp = store('address')
                entry[cm.addr_e] = cm.reformat_addr(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                tmp = store('phone1')
                entry[cm.tel] = tmp[0].text.strip() if len(tmp) > 0 and tmp[0].text else ''

                tmp = store('fax1')
                entry[cm.fax] = tmp[0].text.strip() if len(tmp) > 0 and tmp[0].text else ''

                tmp = store('latitude')
                try:
                    entry[cm.lat] = string.atof(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                except (ValueError, KeyError, TypeError) as e:
                    cm.dump('Error in fetching lat: %s' % str(e), log_name)
                tmp = store('longitude')
                try:
                    entry[cm.lng] = string.atof(tmp[0].text) if len(tmp) > 0 and tmp[0].text else ''
                except (ValueError, KeyError, TypeError) as e:
                    cm.dump('Error in fetching lng: %s' % str(e), log_name)

                gs.field_sense(entry)
                ret = gs.addr_sense(entry[cm.addr_e])
                if ret[0] is not None and entry[cm.country_e] == '':
                    entry[cm.country_e] = ret[0]
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)
                cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                    entry[cm.name_e], entry[cm.addr_e],
                                                                    entry[cm.country_e],
                                                                    entry[cm.continent_e]), 'tudor_log.txt')
                db.insert_record(entry, 'stores')
                id_set[store_id] = entry
                store_list.append(entry)
        except (IndexError, TypeError) as e:
            print traceback.format_exc()
            continue

    return store_list
示例#53
0
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        logger.info('(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                    entry[cm.name_e], entry[cm.addr_e],
                                                                    entry[cm.city_e],
                                                                    entry[cm.country_e], entry[cm.continent_e]))
        cm.insert_record(db, entry, 'spider_stores.stores')
        store_list.append(entry)
    elif data['m'] == 5:
        # Layout 5: one table per country; the cell with white text holds the
        # country name and each top-aligned cell holds one store.
        for country in (pq(tmp) for tmp in pq(body)('table[cellpadding="6"]')):
            country_e = cm.html2plain(country('td[style="color:#FFF;"]')[0].text).strip().upper()
            # Any country name containing "arab emirates" is normalised to 'UAE'.
            country_e = 'UAE' if 'arab emirates' in country_e.lower() else country_e
            for store in country('td[valign="top"]'):
                # Cells carrying a bgcolor attribute are skipped.
                if 'bgcolor' in store.attrib:
                    continue
                addr_raw = cm.reformat_addr(unicode(pq(store)))
                if addr_raw == '':
                    continue
                # First comma-separated item is the store name; the rest is
                # the address.
                addr_list = [tmp.strip() for tmp in addr_raw.split(',')]
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                entry[cm.name_e] = addr_list[0]
                entry[cm.country_e] = country_e
                del addr_list[0]
                # For these countries (and Thai entries ending in an "ext."
                # item) the last item is taken as the phone number.
                # NOTE(review): addr_list[-1] raises IndexError when the cell
                # holds a name only.
                if country_e in ('HONG KONG', 'JAPAN', 'UAE') or (
                                country_e == 'THAILAND' and 'ext.' in addr_list[-1]):
                    entry[cm.tel] = addr_list[-1]
                    del addr_list[-1]
                entry[cm.addr_e] = ', '.join(addr_list)

                gs.field_sense(entry)
                ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
示例#54
0
文件: y3.py 项目: haizi-zh/firenze
def fetch_stores(data):
    """
    Fetch a store-list page and extract the details of every store on it.

    Each ``<li>`` inside ``<ul class="store-list">`` becomes one entry: the
    ``class`` attribute is stored as the store type, the name and address are
    read from the ``store-name`` / ``store-address`` elements, and continent,
    country and city are taken from ``data``. Every entry is inserted into
    the ``stores`` table.

    :rtype : [entries]
    :param data: dict with 'url' and the cm.continent_e, cm.country_e and
        cm.city_e values applied to every store found.
    """
    # Bound locally so that the error report below names the URL that was
    # actually requested.
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        }
        cm.dump(dump_data)
        return []

    entries = []
    list_open = u'<ul class="store-list">'
    start = html.find(list_open)
    if start == -1:
        return entries
    start += len(list_open)
    end = html.find(u'</ul>', start)
    # Without a closing tag use the rest of the page; slicing with -1 would
    # silently drop the last character instead.
    html = html[start:end] if end != -1 else html[start:]

    for li_class, sub_html in re.findall(ur'<li class="(.*?)">(.*?)</li>',
                                         html, re.S):
        store = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        store[cm.store_type] = li_class
        m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html)
        if m2:
            store[cm.name_e] = cm.reformat_addr(m2[0])
        m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S)
        if m2:
            store[cm.addr_e] = cm.reformat_addr(m2[0])

        cm.update_entry(
            store, {
                cm.continent_e: data[cm.continent_e].strip().upper(),
                cm.country_e: data[cm.country_e].strip().upper(),
                cm.city_e: data[cm.city_e].strip().upper()
            })

        gs.field_sense(store)
        # Values inferred from the address only fill fields still empty.
        ret = gs.addr_sense(store[cm.addr_e])
        if ret[0] is not None and store[cm.country_e] == '':
            store[cm.country_e] = ret[0]
        if ret[1] is not None and store[cm.province_e] == '':
            store[cm.province_e] = ret[1]
        if ret[2] is not None and store[cm.city_e] == '':
            store[cm.city_e] = ret[2]
        gs.field_sense(store)
        store[cm.city_e] = cm.extract_city(store[cm.city_e])[0]

        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, store[cm.name_e], store[cm.addr_e],
            store[cm.country_e], store[cm.continent_e])
        db.insert_record(store, 'stores')
        entries.append(store)

    return entries
示例#55
0
def fetch_stores(data):
    """Run the store-locator search and store every result in the database.

    The landing page at ``data['url']`` is fetched first to obtain the
    session cookies and the per-session search URL; the search form is then
    posted with a fixed location and the result columns are parsed.

    :param data: dict with 'url', 'brand_id', 'brandname_e' and
        'brandname_c'.
    :rtype: list of the store entries inserted; [] when a request fails or
        the search URL cannot be found.
    """
    url = data['url']
    try:
        html, cookie_map = cm.get_data_cookie(url)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    print 'SLEEPING>>>>'
    time.sleep(5)

    m = re.search(
        'http://www.ninewest.com/on/demandware.store/Sites-ninewest-Site/default/Stores-Find/C\d{10}',
        html)
    if m is None:
        return []
    url = m.group(0)

    # Forward the session cookies except the personalisation cookies and
    # 'sr_token', and add the 'invited_visitor_22225' cookie.
    cookie_map = dict(
        (key, value) for key, value in cookie_map.items()
        if 'dwpersonalization_' not in key and key != 'sr_token')
    cookie_map['invited_visitor_22225'] = '1'

    try:
        html = cm.post_data(url, {
            'dwfrm_storelocator_startaddress': 'kingman',
            'dwfrm_storelocator_maxDistance': 30.00,
            'dwfrm_storelocator_outlet': 'true',
            'dwfrm_storelocator_retail': 'true',
            'dwfrm_storelocator_optical': 'true',
            'dwfrm_storelocator_eyewear': 'true',
            'dwfrm_storelocator_apparel': 'true',
            'dwfrm_storelocator_attire': 'true',
            'dwfrm_storelocator_department': 'true',
            'dwfrm_storelocator_IsMensFootwear': 'true',
            'dwfrm_storelocator_IsRRR': 'true',
            'dwfrm_storelocator_IsRRNY': 'true',
            'dwfrm_storelocator_IsRRS': 'true',
            'dwfrm_storelocator_wholesale': 'true',
            'dwfrm_storelocator_bba': 'true',
            'dwfrm_storelocator_ba': 'true',
            'dwfrm_storelocator_search.x': 0,
            'dwfrm_storelocator_search.y': 0,
            'dwfrm_storelocator_countryCode': 'US',
            'dwfrm_storelocator_postalCode': '67068',
            'dwfrm_storelocator_distanceUnit': 'mi',
            'dwfrm_storelocator_long': -98.117208,
            'dwfrm_storelocator_lat': 37.647131,
        },
                            cookie=cookie_map)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    store_list = []
    for m1 in re.finditer(ur'<div class="storeColumnOne">', html):
        sub, start, end = cm.extract_closure(html[m1.start():], ur'<div\b',
                                             ur'</div>')
        if end == 0:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        m2 = re.search(ur'<div class="storename">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.name_e] = m2.group(1).strip()

        addr_list = re.findall(
            ur'<div class="adddressline">([^<>]+)</div>', sub)
        entry[cm.addr_e] = ', '.join(addr_list)

        m2 = re.search(ur'<div class="citystatezip">([^<>]+)</div>', sub)
        if m2 is not None:
            tmp = cm.reformat_addr(m2.group(1))
            # Expected shape: "<city>, <state> <zip>".
            terms = re.split('[, ]+', tmp)
            if len(terms) < 3:
                entry[cm.addr_e] = tmp
            else:
                ret = gs.look_up(terms[0], 3)
                if ret is not None:
                    entry[cm.city_e] = ret['name_e']
                else:
                    entry[cm.city_e] = terms[0].strip().upper()

                ret = gs.look_up(terms[1], 2)
                if ret is not None:
                    entry[cm.province_e] = ret['name_e']
                else:
                    # Fall back to the raw state term (terms[1]); terms[0]
                    # is the city.
                    entry[cm.province_e] = terms[1].strip().upper()

                if re.match('\s*\d{5,}\s*', terms[2]) is not None:
                    entry[cm.zip_code] = terms[2].strip()

        m2 = re.search(ur'<div class="storephone">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.tel] = m2.group(1)

        cm.update_entry(entry, {
            'country_e': 'UNITED STATES',
            'continent_e': 'NORTH AMERICA'
        })
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    return store_list
示例#56
0
        # entry[cm.city_e] = data['city']

        # The store occupies one table row starting at the current match.
        store_sub = cm.extract_closure(sub[m.start():], ur'<tr\b', ur'</tr')[0]
        # The first <strong> element is the store name; every <strong>
        # element is then removed so the names do not leak into the address.
        pat = re.compile(ur'<strong>([^<>]+)</strong>')
        m1 = re.search(pat, store_sub)
        if m1 is not None:
            entry[cm.name_e] = cm.html2plain(m1.group(1)).strip()
            store_sub = re.sub(pat, '', store_sub)
        # Only the text before the nested <table> is parsed for address and
        # contact items; without a nested table these fields stay unset.
        start = store_sub.find(ur'<table>')
        if start != -1:
            addr_list = []
            pat_tel = re.compile(ur'phone\s*[:\.]', re.I)
            pat_fax = re.compile(ur'fax\s*[:\.]', re.I)
            for term in [
                    tmp.strip()
                    for tmp in cm.reformat_addr(store_sub[:start]).split(',')
            ]:
                if term == '':
                    continue
                elif re.search(pat_tel, term):
                    entry[cm.tel] = re.sub(pat_tel, '', term).strip()
                elif re.search(pat_fax, term):
                    entry[cm.fax] = re.sub(pat_fax, '', term).strip()
                else:
                    addr_list.append(term)
            entry[cm.addr_e] = ', '.join(addr_list)

        # The detail link (ending in "id=<digits>") identifies the store;
        # rows without such a link or already present in store_map are skipped.
        m1 = re.search(ur'href="([^"]+id=\d+)"', store_sub)
        if m1 is None or m1.group(1) in store_map:
            continue
示例#57
0
def fetch_stores(db, data, logger):
    brand_id, brand_name, url = (data[key]
                                 for key in ('brand_id', 'brandname_c', 'url'))

    # try:
    body = cm.get_data(url)
    q = pq(body)
    # except Exception, e:
    #     logger.error(unicode.format(u'Error in fetching contents for {0}', url))
    #     return ()

    m1 = re.search(ur'var\s+markers\s*=\s*\[', body)
    if not m1:
        logger.error(
            unicode.format(u'Error in finding stores for {0}:{1}', brand_id,
                           brand_name))
        return ()

    body = body[m1.end() - 1:]
    m2 = re.search(ur'\]\s*;', body)
    if not m2:
        logger.error(
            unicode.format(u'Error in finding stores for {0}:{1}', brand_id,
                           brand_name))
        return ()
    raw = json.loads(body[:m2.end() - 1])

    store_list = []
    for s in raw:
        entry = cm.init_store_entry(brand_id, brand_name, data['brandname_c'])
        # try:
        try:
            entry[cm.lat], entry[cm.lng] = (float(s['location'][idx])
                                            for idx in (0, 1))
        except (KeyError, IndexError, ValueError, TypeError):
            pass

        s = s['content']
        try:
            entry[cm.name_e] = cm.html2plain(s['title']).strip()
        except (KeyError, TypeError):
            pass

        tmp_list = s['analytics_label'].split('-')
        entry[cm.country_e] = tmp_list[0]
        entry[cm.city_e] = cm.extract_city(tmp_list[1])[0]

        try:
            entry[cm.addr_e] = cm.reformat_addr(s['address']).strip()
        except (KeyError, TypeError):
            pass

        try:
            entry[cm.fax] = s['fax'].strip()
        except (KeyError, TypeError):
            pass
        try:
            entry[cm.tel] = s['phone'].strip()
        except (KeyError, TypeError):
            pass
        try:
            entry[cm.email] = s['mail'].strip()
        except (KeyError, TypeError):
            pass
        try:
            entry[
                cm.
                url] = u'http://en.longchamp.com/store/map' + s['url'].strip()
        except (KeyError, TypeError):
            pass
        try:
            entry[cm.zip_code] = cm.html2plain(s['zipcode_town']).replace(
                tmp_list[1], '').strip()
        except (KeyError, TypeError):
            pass

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        logger.info(
            unicode.format(
                u'{0}:{1} FOUND STORE: {2}, {3}, ({4}, {5}, {6})',
                data['brand_id'], data['brandname_e'],
                *(entry[key] for key in (cm.name_e, cm.addr_e, cm.city_e,
                                         cm.country_e, cm.continent_e))))

        cm.insert_record(db, entry, 'spider_stores.stores')
        store_list.append(entry)

    return tuple(store_list)
示例#58
0
        # Request the store page for the given province/city (the country
        # parameter is fixed to China).
        url = data['url']
        param = {'br': '_1', 'ca': '_R', 'wr': 'HC', 'cn': u'中国', 'cr': data['province'], 'cy': data['city']}
        try:
            body = cm.get_data(url, param)
        except Exception, e:
            cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name)
            return []
    else:
        # The caller already supplied the page body.
        body = data['body']

    store_list = []
    city = data['city']
    # When no city was given, fall back to the page heading.
    if city == '':
        m = re.search(ur'<span id="m_sthead"\s*>(.+?)</span>', body)
        if m is not None:
            city = cm.reformat_addr(m.group(1))
    # Drop the "市" (city) character before looking the name up.
    city = city.replace(u'市', u'').strip()
    # One entry per store-name span.
    for m in re.finditer(ur'<span id="m_stname"[^<>]*>(.+?)</span>', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.province_c] = data['province']
        # English province/city names are filled only when the lookup
        # succeeds.
        ret = gs.look_up(data['province'], 2)
        if ret is not None:
            entry[cm.province_e] = ret['name_e']
        entry[cm.city_c] = city
        ret = gs.look_up(city, 3)
        if ret is not None:
            entry[cm.city_e] = ret['name_e']

        entry[cm.name_e] = cm.reformat_addr(m.group(1))
示例#59
0
文件: zegna.py 项目: haizi-zh/firenze
def get_stores(data):
    """Parse every store listed on one page and record it.

    The page at ``data['url']`` is downloaded through ``common.get_data``
    and cut into chunks, each starting at a
    ``<li class="info-store clearfix">`` tag.  For every chunk a store entry
    is built, passed through ``gs.field_sense``, written with
    ``db.insert_record(entry, 'stores')`` and collected.

    Args:
        data: dict with the keys ``'url'`` (page to download) and ``'name'``
            (stored upper-cased as the entry's country); an optional
            ``'state'`` key is copied into the entry's province field.

    Returns:
        A list with the entries found on the page, or an empty list when
        the download raised.
    """
    url = data['url']
    # Single-argument parenthesised print: identical output under the
    # Python 2 print statement, and still valid as a function call.
    print('Trying to get stores for %s' % data['name'])
    try:
        html = common.get_data(url)
    except Exception:
        # Deliberate best effort: report the failing URL and carry on
        # with no stores instead of aborting the caller.
        print('Error occured: %s' % url)
        common.dump({
            'level': 1,
            'time': common.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        })
        return []

    marker = '<li class="info-store clearfix">'
    store_list = []
    start = html.find(marker)
    while start != -1:
        end = html.find(marker, start + 1)
        # find() yields -1 for the last store; slicing with -1 would chop
        # the final character, so run to the end of the page instead.
        sub_html = html[start:] if end == -1 else html[start:end]

        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)

        # Only the first occurrence of each field inside the chunk is used.
        m = re.search(r'<h1><a href="(.*?)">(.*?)</a>', sub_html)
        if m is not None:
            entry[common.url] = host + m.group(1)
            entry[common.name_e] = common.html2plain(m.group(2).strip())

        m = re.search(
            r'<span style="display:none" class="ll">\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)\s*</span>',
            sub_html)
        if m is not None:
            common.update_entry(entry, {
                common.lat: float(m.group(1)),
                common.lng: float(m.group(2))
            })

        m = re.search(r'<span class="map-address">(.*?)</span>', sub_html)
        if m is not None:
            entry[common.addr_e] = common.reformat_addr(m.group(1))

        m = re.search(r'<span class="type">phone:</span>(.*?)<br />',
                      sub_html)
        if m is not None:
            entry[common.tel] = m.group(1).strip()

        m = re.search(r'<a class="email" href="mailto:(.*?@.*?)">', sub_html)
        if m is not None:
            entry[common.email] = m.group(1).strip()

        opening_s = sub_html.find('<ul class="opening-hours')
        if opening_s != -1:
            opening_e = sub_html.find('</ul>', opening_s)
            if opening_e == -1:
                # Unclosed list: take the rest of the chunk rather than
                # letting -1 drop its last character.
                opening_e = len(sub_html)
            o_str = sub_html[opening_s:opening_e]
            entry[common.hours] = ', '.join(
                re.findall(r'<li>(.+?)</li>', o_str))

        brand_s = sub_html.find('<ul class="brands clearfix">')
        if brand_s != -1:
            brand_e = sub_html.find('</ul>', brand_s)
            if brand_e == -1:
                brand_e = len(sub_html)
            b_str = sub_html[brand_s:brand_e]
            entry[common.store_type] = ', '.join(
                common.html2plain(item)
                for item in re.findall(r'<li><a href=".*?">(.+?)</a></li>',
                                       b_str))

        # Geo
        if 'state' in data:
            entry[common.province_e] = data['state']
        entry[common.country_e] = data['name'].strip().upper()
        gs.field_sense(entry)

        print('%s Found store: %s, %s (%s, %s)' % (
            brandname_e, entry[common.name_e], entry[common.addr_e],
            entry[common.country_e], entry[common.continent_e]))
        db.insert_record(entry, 'stores')
        store_list.append(entry)

        # The next chunk begins where this one ended (-1 stops the loop).
        start = end

    return store_list
示例#60
0
    m = re.search(
        ur'json_init_map\s*=\s*\["(-?\d+\.?\d*)"\s*,\s*"(-?\d+\.?\d*)"', body)
    if m is not None:
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))

    start = body.find(ur'<div class="box-testuale-right">')
    if start == -1:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
    m = re.search(ur'<div class="box-adress-store">(.+?)</div>', sub, re.S)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    entry[cm.addr_e] = cm.reformat_addr(m.group(1))
    m = re.search(ur'<h4>(.+?)</h4>', sub)
    if m is not None and 't:' in m.group(1).lower():
        entry[cm.tel] = cm.extract_tel(m.group(1))
    m = re.search(ur'<div class="box-open-store">(.+?)</div>', body, re.S)
    if m is not None:
        entry[cm.hours] = cm.reformat_addr(m.group(1))

    ret = None
    if entry[cm.lat] != '' and entry[cm.lng] != '':
        ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
    if ret is None:
        ret = gs.geocode(', '.join((entry[cm.addr_e], data['zone'])))
    if ret is not None:
        city = ''
        province = ''