Exemplo n.º 1
0
 def log_uncommitted(state, shift, left_side_shift, right_side_shift,
                     search_offset):
     """Log one debug line describing an uncommitted shift for an event span.

     Reads state["start_time"]/state["end_time"] for the span boundaries.
     NOTE(review): this block is indented by one extra space in the source
     (likely a scraping artifact) -- confirm against the original file.
     """
     logging.debug(
         '{0}-{1}: shift: {2:0.5f} [{3:0.5f}, {4:0.5f}], search offset: {5:0.6f}'
         .format(format_time(state["start_time"]),
                 format_time(state["end_time"]), shift, left_side_shift,
                 right_side_shift, search_offset))
Exemplo n.º 2
0
def fix_near_borders(events):
    """Relink events at either border whose diff is wildly off.

    An event is treated as broken when its diff falls outside 0.2x-5x of the
    limiting diff, which is the smaller of the overall median diff and the
    median diff of the ten events nearest that border.
    """
    def fix_border(event_list, median_diff):
        near_median = np.median([e.diff for e in event_list[:10]],
                                overwrite_input=True)
        limit = min(near_median, median_diff)
        pending = []
        for current in event_list:
            if 0.2 < (current.diff / limit) < 5:
                # First sane event found: relink everything collected so far.
                for bad in pending:
                    bad.link_event(current)
                return len(pending)
            pending.append(current)
        # Every event looked broken; nothing could be relinked.
        return 0

    overall_median = np.median([e.diff for e in events], overwrite_input=True)

    count = fix_border(events, overall_median)
    if count:
        logging.debug('Fixing {0} border events right after {1}'.format(
            count, format_time(events[0].start)))

    count = fix_border(list(reversed(events)), overall_median)
    if count:
        logging.debug('Fixing {0} border events right before {1}'.format(
            count, format_time(events[-1].end)))
Exemplo n.º 3
0
Arquivo: sushi.py Projeto: tp7/Sushi
def fix_near_borders(events):
    """
    We assume that all lines with diff greater than 5 * (median diff across all events) are broken
    """
    def fix_border(event_list, median_diff):
        # Median diff of the ten events closest to this border.
        last_ten_diff = np.median([x.diff for x in event_list[:10]], overwrite_input=True)
        diff_limit = min(last_ten_diff, median_diff)
        broken = []
        for event in event_list:
            # "Broken" means the diff is outside 0.2x-5x of the limit.
            if not 0.2 < (event.diff / diff_limit) < 5:
                broken.append(event)
            else:
                # First sane event found: relink all broken ones to it.
                for x in broken:
                    x.link_event(event)
                return len(broken)
        # Every event was broken; nothing could be relinked.
        return 0

    median_diff = np.median([x.diff for x in events], overwrite_input=True)

    # Fix events at the start of the list...
    fixed_count = fix_border(events, median_diff)
    if fixed_count:
        logging.info('Fixing {0} border events right after {1}'.format(fixed_count, format_time(events[0].start)))

    # ...and, by reversing, at the end.
    fixed_count = fix_border(list(reversed(events)), median_diff)
    if fixed_count:
        logging.info('Fixing {0} border events right before {1}'.format(fixed_count, format_time(events[-1].end)))
Exemplo n.º 4
0
def split_broken_groups(groups):
    """Split chapter-based groups whose shifts are not internally consistent.

    Groups whose shift std exceeds MAX_GROUP_STD are re-grouped via
    detect_groups(); afterwards consecutive groups are merged back together
    while their border shifts agree within ALLOWED_ERROR and the combined std
    stays below MAX_GROUP_STD. Returns the resulting list of groups.
    """
    correct_groups = []
    broken_found = False
    for g in groups:
        std = np.std([e.shift for e in g])
        if std > MAX_GROUP_STD:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                u'Shift is not consistent between {0} and {1}, most likely chapters are wrong (std: {2}). '
                u'Switching to automatic grouping.'.format(
                    format_time(g[0].start), format_time(g[-1].end), std))
            correct_groups.extend(detect_groups(g))
            broken_found = True
        else:
            correct_groups.append(g)

    if broken_found:
        # Re-merge: start a new output group only when the shift jumps or the
        # merged std would exceed the limit.
        groups_iter = iter(correct_groups)
        correct_groups = [list(next(groups_iter))]
        for group in groups_iter:
            if abs_diff(correct_groups[-1][-1].shift, group[0].shift) >= ALLOWED_ERROR \
                    or np.std([e.shift for e in group + correct_groups[-1]]) >= MAX_GROUP_STD:
                correct_groups.append([])

            correct_groups[-1].extend(group)
    return correct_groups
Exemplo n.º 5
0
def split_broken_groups(groups, min_auto_group_size):
    """Split chapter-based groups whose shifts are not internally consistent.

    Groups whose shift std exceeds MAX_GROUP_STD are re-grouped via
    detect_groups() (honouring min_auto_group_size); afterwards adjacent
    groups are merged while their border shifts agree within ALLOWED_ERROR
    and the combined std stays below MAX_GROUP_STD.
    """
    correct_groups = []
    broken_found = False
    for g in groups:
        std = np.std([e.shift for e in g])
        if std > MAX_GROUP_STD:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                u'Shift is not consistent between {0} and {1}, most likely chapters are wrong (std: {2}). '
                u'Switching to automatic grouping.'.format(
                    format_time(g[0].start), format_time(g[-1].end), std))
            correct_groups.extend(detect_groups(g, min_auto_group_size))
            broken_found = True
        else:
            correct_groups.append(g)

    if broken_found:
        correct_groups.sort(key=lambda g: g[0].start)

        # Merge neighbouring groups whose border shifts agree and whose
        # combined std stays under the limit.
        i = 0
        while i < len(correct_groups) - 1:
            if abs_diff(correct_groups[i][-1].shift, correct_groups[i + 1][0].shift) < ALLOWED_ERROR \
                    and np.std([e.shift for e in correct_groups[i] + correct_groups[i + 1]]) < MAX_GROUP_STD:
                correct_groups[i].extend(correct_groups[i + 1])
                del correct_groups[i + 1]
            else:
                i += 1

    return correct_groups
Exemplo n.º 6
0
def snap_groups_to_keyframes(events, chapter_times, max_ts_duration,
                             max_ts_distance, src_keytimes, dst_keytimes,
                             src_timecodes, dst_timecodes, max_kf_distance,
                             kf_mode):
    """Snap grouped events to nearby keyframes; adjusts events in place.

    Step 1 ('shift'/'all'): move whole groups by an interpolated per-group
    shift without changing durations. Step 2 ('snap'/'all'): snap start/end
    times of single-line groups individually; typesetting groups (more than
    one line) are skipped. Does nothing when max_kf_distance is falsy.
    """
    if not max_kf_distance:
        return

    groups = merge_short_lines_into_groups(events, chapter_times,
                                           max_ts_duration, max_ts_distance)

    if kf_mode == 'all' or kf_mode == 'shift':
        #  step 1: snap events without changing their duration. Useful for some slight audio imprecision correction
        shifts = []
        times = []
        for group in groups:
            shifts.extend(
                find_keyframe_shift(group, src_keytimes, dst_keytimes,
                                    src_timecodes, dst_timecodes,
                                    max_kf_distance))
            times.extend((group[0].shifted_start, group[-1].shifted_end))

        shifts = interpolate_nones(shifts, times)
        if shifts:
            mean_shift = np.mean(shifts)
            # Pair consecutive values into (start_shift, end_shift) per group.
            shifts = zip(*(iter(shifts), ) * 2)

            logging.debug('Group {0}-{1} corrected by {2}'.format(
                format_time(events[0].start), format_time(events[-1].end),
                mean_shift))
            for group, (start_shift, end_shift) in izip(groups, shifts):
                if abs(start_shift - end_shift) > 0.001 and len(group) > 1:
                    # Inconsistent ends on a multi-line group: pick the shift
                    # closer to the mean and apply it uniformly.
                    actual_shift = min(start_shift,
                                       end_shift,
                                       key=lambda x: abs(x - mean_shift))
                    logging.warning(
                        "Typesetting group at {0} had different shift at start/end points ({1} and {2}). Shifting by {3}."
                        .format(format_time(group[0].start), start_shift,
                                end_shift, actual_shift))
                    for e in group:
                        e.adjust_shift(actual_shift)
                else:
                    for e in group:
                        e.adjust_additional_shifts(start_shift, end_shift)

    if kf_mode == 'all' or kf_mode == 'snap':
        # step 2: snap start/end times separately
        for group in groups:
            if len(group) > 1:
                # BUGFIX: was `pass`, which fell through and snapped the
                # group's first line anyway, contradicting the comment below.
                continue  # we don't snap typesetting
            start_shift, end_shift = find_keyframes_distances(
                group[0], src_keytimes, dst_keytimes, src_timecodes,
                max_kf_distance)
            if abs(start_shift) > 0.01 or abs(end_shift) > 0.01:
                logging.debug(
                    'Snapping {0} to keyframes, start time by {1}, end: {2}'.
                    format(format_time(group[0].start), start_shift,
                           end_shift))
                group[0].adjust_additional_shifts(start_shift, end_shift)
Exemplo n.º 7
0
def get_store_list(data):
    """
    Return the store list; each store carries its country information.
    :rtype : [{'name':'store name', 'url':'http://...', 'city':'NEW YORK', 'country:':'AUSTRALIA'}, ...]
    :param data:
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        # Fetch failed: record a structured dump entry and bail out.
        # NOTE(review): brand_id is read from an outer/global scope here.
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<ul>\s+?<h3 class="country-name">(.+?)</h3>', html, re.S):
        sub, start, end = cm.extract_closure(html[m.start():], ur'<ul>', ur'</ul>')
        if end == 0:
            continue
            # split points between the different countries
        splits = [[m1.start(), m1.group(1)] for m1 in re.finditer(ur'<h3 class="country-name">(.+?)</h3>', sub)]
        splits.append([-1, ''])
        for i in xrange(len(splits) - 1):
            # search within the same country
            sub1 = sub[splits[i][0]:splits[i + 1][0]]
            country = splits[i][1].upper()
            for m1 in re.findall(ur'<li>\s*?<a href="(http://us.christianlouboutin.com/us_en/storelocator/\S+?)">'
                                 ur'(.+?)</a>,(.+?)</li>', sub1):
                store_list.append({'name': m1[1].strip(), 'url': m1[0], 'city': m1[2].strip().upper(),
                                   'country': country})
    # NOTE(review): falls off without `return store_list` -- the snippet looks
    # truncated; confirm the original ends with `return store_list`.
Exemplo n.º 8
0
def fetch_countries(data):
    """
    Fetch the country list.
    :param data:
    :return:
    """
    url = data['home_url']
    try:
        html = cm.get_data(url)
    except Exception:
        # Fetch failed: record a structured dump entry and give up.
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    country_list = []
    # Each <option> carries the numeric country id and the display name.
    for m in re.findall(ur'<option value="(\d+)" class="3">(.+?)</option>',
                        html):
        d = data.copy()
        d['country_id'] = string.atoi(m[0])
        d['country_e'] = m[1].strip().upper()
        country_list.append(d)
    # NOTE(review): no `return country_list` here -- the snippet appears
    # truncated; confirm the original returns country_list.
Exemplo n.º 9
0
def fetch_stores(data):
    """Fetch all stores of one country from the JSON store-locator endpoint.

    Each store is normalised into a cm store entry, sanity-checked via gs,
    printed, appended to store_list and inserted into the `stores` table.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {'rsp': 'json', 'country': data['country_code']})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    raw = json.loads(body)
    store_list = []
    for s in raw['stores']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = cm.html2plain(s['name']).strip()

        # Join the non-empty address lines into a single string.
        addr_list = []
        for key in ['address1', 'address2']:
            if s[key].strip() != '':
                addr_list.append(cm.reformat_addr(s[key]))
        entry[cm.addr_e] = ' '.join(addr_list)

        # r=s['region'].strip().upper()
        # m = re.search(ur'\b([A-Z]{2})\b', r)
        # if data[cm.country_e]=='UNITED STATES' and m is not None:
        #     # United States
        #     ret = gs.look_up(m.group(1), 2)
        #     if ret is not None:
        #         r = ret['name_e']
        # entry[cm.province_e] = r

        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        entry[cm.zip_code] = s['zip'].strip()
        entry[cm.country_e] = data[cm.country_e]
        entry[cm.lat] = string.atof(s['lat'])
        entry[cm.lng] = string.atof(s['lng'])
        entry[cm.tel] = s['phone'].strip()
        entry[cm.fax] = s['fax'].strip()
        entry[cm.email] = s['emailaddress'].strip()
        entry[cm.url] = s['website'].strip()

        # openingHours looks PHP-serialize-like (i:<day>;s:<len>:"<hours>");
        # map the numeric day index onto a weekday abbreviation.
        days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        opening = []
        if 'openingHours' in s and s['openingHours'] is not None:
            for m in re.finditer(ur'i:(\d);s:\d+:\\?"([^\\"]+?)\\?"', s['openingHours']):
                opening.append('%s: %s' % (days[string.atoi(m.group(1))], m.group(2).strip()))
            entry[cm.hours] = ', '.join(opening)

        # Sanity passes: derive province/city from the address when missing.
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'],
                                                              entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e],
                                                              entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
    # NOTE(review): no `return store_list` -- snippet appears truncated.
Exemplo n.º 10
0
def fetch_stores(data):
    """Fetch the store page and index its <h2> section headers.

    NOTE(review): the snippet stops right after building `tmp` and looks
    truncated; the original presumably goes on to parse each section.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # Narrow to the <article> element, then record the position and name of
    # every <h2> header inside it.
    body, start, end = cm.extract_closure(body, ur'<article\b', ur'</article>')
    tmp = []
    for m in re.finditer(ur'<h2>\s*(.+?)\s*</h2>', body):
        tmp.append({
            'idx1': m.start(),
            'idx2': m.end(),
            'name': m.group(1).strip().upper()
        })
Exemplo n.º 11
0
def fetch_cities(data):
    """Return per-city work items for the three supported cities.

    Downloads data['url'] once and attaches the page body to each returned
    copy of *data*; on fetch failure a dump record is written and [] is
    returned.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, 'shanghaivive_log.txt')
        cm.dump({
            'level': 0,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id']
        })
        return []

    # Hard-coded city list: (city name, area code).
    known_cities = ((u'上海', '021'), (u'北京', '010'), (u'成都', '028'))
    results = []
    for city, code in known_cities:
        entry = data.copy()
        entry['city'] = city
        entry['code'] = code
        entry['body'] = body
        results.append(entry)
    return results
Exemplo n.º 12
0
def fetch_countries(data):
    """Scrape the country links from the store-locator landing page.

    Follows one level of 'Object moved' redirects, then extracts every
    country <li> from the country <ul>. Returns one copy of *data* per
    country found, or [] on any failure.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    # Handle redirection pages.
    redirect = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if redirect is not None:
        data['url'] = data['host'] + redirect.group(1)
        return fetch_countries(data)

    anchor = re.search('<span class="country">Choose a country</span>', html)
    if anchor is None:
        return []
    sub, start, end = cm.extract_closure(html[anchor.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for href, label in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        entry = data.copy()
        name = cm.html2plain(label).strip().upper()
        # Normalise to the canonical English name when the lookup knows it.
        hit = gs.look_up(name, 1)
        if hit is not None:
            name = hit['name_e']
        entry['country_e'] = name
        entry['province_e'] = ''
        entry['url'] = data['host'] + href
        country_list.append(entry)
    return country_list
Exemplo n.º 13
0
def get_store_list(data):
    """
    Fetch the list of stores in one city.
    :param data:
    :return:
    """
    url = data['url']
    try:
        html = cm.post_data(url, {
            'country': data['country_id'],
            'city': data['city_id'],
            'recordid': -1
        })
    except Exception:
        print 'Error occured: %s' % url
        # NOTE(review): brand_id is read from an outer/global scope.
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        }
        cm.dump(dump_data)
        return []
    pass  # NOTE(review): dead statement; safe to delete.

    ret = []
    # Store ids are embedded in hrefs of the form ...store-<id>...
    for m in re.findall(ur'<a href=.+?store-(\d+).+?">', html, re.S):
        store_id = string.atoi(m.strip())
        entry = dict(data)
        entry['store_id'] = store_id
        ret.append(entry)
    # NOTE(review): no `return ret` -- snippet appears truncated.
Exemplo n.º 14
0
def get_countries(data):
    """
    Return the country list.
    :rtype : [{'country_id':**, 'country':**}, ...]
    :param data:
    :return:
    """
    url = data['url']
    try:
        # NOTE(review): 'recordit' differs from the 'recordid' key used by
        # the sibling get_store_list -- possibly a typo; confirm against the
        # endpoint.
        html = cm.post_data(url, {'country': -1, 'city': -1, 'recordit': -1})
    except Exception:
        print 'Error occured: %s' % url
        # NOTE(review): brand_id is read from an outer/global scope.
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        }
        cm.dump(dump_data)
        return []

    ret = []
    # The response embeds escaped closing tags (<\/a>), hence the \\/ pattern.
    for m in re.findall(
            ur'<li>\s*?<a href=.+?country-(\d+).+?">(.+?)<\\/a><\\/li>', html,
            re.S):
        country_id = string.atoi(m[0].strip())
        country = m[1].replace(r'\r', '').replace(r'\n', '').strip().upper()
        ret.append({'country_id': country_id, 'country': country, 'url': url})
    # NOTE(review): no `return ret` -- snippet appears truncated.
Exemplo n.º 15
0
def fetch_continents(data):
    """Parse the continent <select> and return one data copy per continent.

    NOTE(review): no `return continent_list` -- snippet appears truncated.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # Locate the continent dropdown and extract its full <select> element.
    start = html.find('<select class="select_continente">')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<select\b',
                                         ur'</select>')
    if end == 0:
        return []

    continent_list = []
    for m in re.findall(ur'<option value="(\d+)"\s*>(.+?)</option>', sub):
        d = data.copy()
        d[cm.continent_e] = m[1].strip().upper()
        d['continent_id'] = string.atoi(m[0])
        continent_list.append(d)
Exemplo n.º 16
0
def split_broken_groups(groups, min_auto_group_size):
    """Split chapter groups whose shift std exceeds MAX_GROUP_STD, then merge
    adjacent compatible groups back together.
    """
    correct_groups = []
    broken_found = False
    for g in groups:
        std = np.std([e.shift for e in g])
        if std > MAX_GROUP_STD:
            # NOTE(review): logging.warn is a deprecated alias of logging.warning.
            logging.warn(u'Shift is not consistent between {0} and {1}, most likely chapters are wrong (std: {2}). '
                         u'Switching to automatic grouping.'.format(format_time(g[0].start), format_time(g[-1].end),
                                                                    std))
            correct_groups.extend(detect_groups(g, min_auto_group_size))
            broken_found = True
        else:
            correct_groups.append(g)

    if broken_found:
        correct_groups.sort(key=lambda g: g[0].start)

        # Merge neighbours whose border shifts agree within ALLOWED_ERROR and
        # whose combined std stays under the limit.
        i = 0
        while i < len(correct_groups) - 1:
            if abs_diff(correct_groups[i][-1].shift, correct_groups[i + 1][0].shift) < ALLOWED_ERROR \
                    and np.std([e.shift for e in correct_groups[i] + correct_groups[i + 1]]) < MAX_GROUP_STD:
                correct_groups[i].extend(correct_groups[i + 1])
                del correct_groups[i + 1]
            else:
                i += 1

    return correct_groups
Exemplo n.º 17
0
def fetch_uk_ireland(data):
    """Fetch the UK & Ireland store page and split it into three sections.

    The page is cut at the 'OCEAN TERMINAL' and 'FRENCH CONNECTION OUTLET'
    headers into body1/body2/body3; the <h3> headers of the first section
    are indexed into `tmp`.

    NOTE(review): snippet stops after indexing body1 and appears truncated.
    """
    url = 'http://www.frenchconnection.com/content/stores/united+kingdom.htm'
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    body, start, end = cm.extract_closure(body, ur'<article\b', ur'</article>')
    start = body.find(ur'<h3>OCEAN TERMINAL</h3>')
    body1 = body[:start]
    start2 = body.find(ur'<h3>FRENCH CONNECTION OUTLET</h3>')
    body2 = body[start + len(ur'<h3>OCEAN TERMINAL</h3>'):start2]
    body3 = body[start2 + len(ur'<h3>FRENCH CONNECTION OUTLET</h3>'):]

    tmp = []
    for m in re.finditer(ur'<h3>\s*(.+?)\s*</h3>', body1):
        tmp.append({
            'idx1': m.start(),
            'idx2': m.end(),
            'name': m.group(1).strip().upper()
        })
Exemplo n.º 18
0
def fetch_countries(data):
    """
    Fetch the country list.
    :param data:
    :return:
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    country_list = []
    for m in re.findall(
            ur'<li class="Level4">\s*?<a id="_.+?" href="(.+?)">(.+?)</a>\s*?</li>',
            html, re.S):
        # NOTE(review): rebinding `data` inside the loop shadows the
        # parameter; sibling functions use `d = data.copy()` instead.
        # Confirm the accumulation of keys across iterations is intended.
        data = data.copy()
        data['country_e'] = m[1].strip().upper()
        data['url'] = data['host'] + m[0]
        country_list.append(data)
    # NOTE(review): no `return country_list` -- snippet appears truncated.
Exemplo n.º 19
0
def fetch_countries(data):
    """Parse the country <select> and enrich each entry via gs.look_up.

    NOTE(review): no `return country_list` -- snippet appears truncated.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = html.find('<select name="country" id="inp-country"')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select>')
    if end == 0:
        return []
    country_list = []
    # Option values are two-letter country codes.
    for m in re.findall(ur'<option value="([A-Z]{2})">(.*?)</option>', sub):
        d = data.copy()
        d['country_code'] = m[0]
        d[cm.country_c] = m[1].strip()
        # Blank the derived fields, then fill them from the lookup table.
        for key in [cm.country_e, cm.continent_e, cm.continent_c]:
            d[key] = ''
        ret = gs.look_up(d['country_code'], 1)
        if ret is not None:
            d[cm.country_e] = ret['name_e']
            d[cm.country_c] = ret['name_c']
            d[cm.continent_c] = ret['continent']['name_c']
            d[cm.continent_e] = ret['continent']['name_e']

        country_list.append(d)
Exemplo n.º 20
0
def fetch(level=1, data=None, user='******', passwd=''):
    """Entry point: reset this brand's rows in `stores` and start the crawl.

    NOTE(review): `store_list` is computed but neither used nor returned, and
    the db handle is closed right after -- the snippet may be truncated.
    """
    # Walk from the root node, where level == 1.
    if data is None:
        data = {
            'url': 'http://www.mido.cn/zh/retailer_li/POS',
            'brand_id': 10260,
            'brandname_e': u'MIDO',
            'brandname_c': u'美度'
        }

    global db
    db = cm.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    # Interpolated values are internal constants here, not user input.
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' %
               ('stores', data['brand_id']))

    url = data['url']
    try:
        data['html'] = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    store_list = fetch_countries(data)

    db.disconnect_db()
Exemplo n.º 21
0
def get_cities(data):
    """Return the city entries of one country from the locator form.

    NOTE(review): `url` and `brand_id` are read from an outer/global scope.
    NOTE(review): no `return city_list` -- snippet appears truncated.
    """
    try:
        d = {"country": data["country_code"], "city": "", "service": -1}
        html = common.post_data(url, d)
    except Exception:
        print "Error occured in getting the list of countries: %s" % url
        dump_data = {"level": 1, "time": common.format_time(), "data": {"data": url}, "brand_id": brand_id}
        common.dump(dump_data)
        return []

    # Narrow the HTML to the city <select> element.
    start = html.find(u'<select id="city" name="city">')
    if start == -1:
        return []
    end = html.find(u"</select>", start)
    html = html[start:end]
    city_list = []
    for m in re.findall(ur'<option value="(.+?)">', html):
        # Skip a handful of known-bad option entries.
        if data["country_code"] == "GB" and "2 davies street" in m.lower():
            continue
        elif data["country_code"] == "RO" and "13 september street" in m.lower():
            continue
        elif "b1603daq" in m.lower():
            continue
        else:
            city_list.append({"city_e": m, "country_e": data["country_e"], "country_code": data["country_code"]})
Exemplo n.º 22
0
def fetch_cities(data):
    """Parse the dsy.add(...) province/city tables from the home page.

    NOTE(review): no `return city_list` -- snippet appears truncated.
    """
    url = data['home_url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt')
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # Remove the currlan == 'ZHT' branch (presumably the traditional-Chinese
    # locale) so only one locale's data remains.
    pat = re.compile(ur"if \(currlan == 'ZHT'\)\s*\{.+?\}", re.S)
    body = re.sub(pat, '', body)

    # dsy.add("0", [...]) lists the provinces...
    m = re.search(ur'dsy.add\("0",\[(.+?)\]', body)
    if m is None:
        cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt')
        return []
    province_list = [m1 for m1 in re.findall(ur'"(.+?)"', m.group(1))]

    # ...and dsy.add("0_<i>", [...]) lists the cities of province i.
    city_list = []
    for m in re.findall(ur'dsy.add\("0_(\d+)",\[(.+?)\]', body):
        for m1 in re.findall(ur'"(.+?)"', m[1]):
            c = data.copy()
            c['province'] = province_list[string.atoi(m[0])]
            c['city'] = m1
            city_list.append(c)
Exemplo n.º 23
0
def fetch_countries(data):
    """Scrape country links; follows one level of 'Object moved' redirects.

    Returns one copy of *data* per country found, with country_e normalised
    via gs.look_up when the lookup knows the name; [] on any failure.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # Handle redirection pages.
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)

    m = re.search('<span class="country">Choose a country</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        d = data.copy()
        country_e = cm.html2plain(m[1]).strip().upper()
        # Normalise to the canonical English name when known.
        ret = gs.look_up(country_e, 1)
        if ret is not None:
            country_e = ret['name_e']
        d['country_e'] = country_e
        d['province_e'] = ''
        d['url'] = data['host'] + m[0]
        country_list.append(d)
    return country_list
Exemplo n.º 24
0
def fetch_cities(data):
    """Query the store-locator endpoint for all cities of one country.

    POSTs the continent/country selection and returns one copy of *data*
    per city in the JSON response, or [] on fetch failure.
    """
    url = data['sel_url']
    payload = {
        'continent': data['continent'],
        'country': data['country'],
        'city': '',
        'page': 0
    }
    try:
        body = cm.post_data(url, payload)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({
            'level': 0,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id']
        })
        return []

    city_list = []
    for name in json.loads(body)['city']:
        entry = data.copy()
        entry['city'] = name
        city_list.append(entry)
    return city_list
Exemplo n.º 25
0
def fetch_cities(data):
    """Return {city, city_id, country, country_id} entries for one country.

    NOTE(review): `url` and `brand_id` come from an outer/global scope, and
    there is no `return city_list` -- snippet appears truncated.
    """
    country_id = data['country_id']
    try:
        html = cm.post_data(url, {'country_id': country_id})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 2,
            'time': cm.format_time(),
            'data': data,
            'brand_id': brand_id
        }
        cm.dump(dump_data)
        return []

    # Narrow the HTML to the contents of the city <select>.
    start = html.find('<select name="city_id" id="city_id">')
    if start == -1:
        return []
    start += len('<select name="city_id" id="city_id">')
    end = html.find('</select>', start)
    html = html[start:end]
    city_list = []
    for m in re.findall(ur'<option\s.*?value="(\d+).*?">(.*?)</option>', html):
        entry = {'city': m[1].strip().upper(), 'city_id': string.atoi(m[0])}
        entry['country'] = data['country']
        entry['country_id'] = data['country_id']
        city_list.append(entry)
Exemplo n.º 26
0
def fetch_cities(data):
    """Parse the dsy.add(...) province/city tables from the home page.

    NOTE(review): no `return city_list` -- snippet appears truncated.
    """
    url = data['home_url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # Remove the currlan == 'ZHT' branch (presumably the traditional-Chinese
    # locale) so only one locale's data remains.
    pat = re.compile(ur"if \(currlan == 'ZHT'\)\s*\{.+?\}", re.S)
    body = re.sub(pat, '', body)

    # dsy.add("0", [...]) lists the provinces...
    m = re.search(ur'dsy.add\("0",\[(.+?)\]', body)
    if m is None:
        cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt')
        return []
    province_list = [m1 for m1 in re.findall(ur'"(.+?)"', m.group(1))]

    # ...and dsy.add("0_<i>", [...]) lists the cities of province i.
    city_list = []
    for m in re.findall(ur'dsy.add\("0_(\d+)",\[(.+?)\]', body):
        for m1 in re.findall(ur'"(.+?)"', m[1]):
            c = data.copy()
            c['province'] = province_list[string.atoi(m[0])]
            c['city'] = m1
            city_list.append(c)
Exemplo n.º 27
0
def fetch_store_details(url, data):
    """
    Fetch store details (one url may host several stores).
    :rtype : [{}]
    :param url:
    :param data:
    """
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s / %s' % (str(data), url)
        # NOTE(review): brand_id is read from an outer/global scope.
        dump_data = {
            'level': 2,
            'time': cm.format_time(),
            'data': data,
            'brand_id': brand_id
        }
        cm.dump(dump_data)
        return []

    # The page may contain several stores; split it into per-store chunks.
    sub_html = []
    for m in re.finditer(ur'<li\s+class\s*=\s*"boutique-info-cadre-\d+"\s*>',
                         html):
        start = m.start() + len(m.group())
        end = html.find('</li>', start)
        sub_html.append(html[start:end])
    # NOTE(review): snippet stops after splitting -- appears truncated.
Exemplo n.º 28
0
def get_continents(data):
    """
    Return the continent list.
    :rtype : [{'name':u'欧洲', 'url':'http://....'}, ...]
    :param data:
    :return:
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({
            'level': 1,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': brand_id
        })
        return []

    # Each continent is an anchor pointing into the store-locator section.
    pattern = ur'<a href="(http://us.christianlouboutin.com/us_en/storelocator/\S+)">(.+?)</a>'
    continents = []
    for link, label in re.findall(pattern, html):
        continents.append({'name': label, 'url': link})
    return continents
Exemplo n.º 29
0
def fetch_countries(data):
    """Parse the country <select> into per-country copies of *data*.

    NOTE(review): no `return country_list` -- snippet appears truncated.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # Narrow to the options between the placeholder entry and </select>.
    start = body.find(
        u'<option value="0" selected="selected">Select a country</option>')
    if start == -1:
        return []
    end = body.find(u'</select>', start)

    country_list = []
    for m in re.findall(ur'<option value="([A-Z]{2})"[^>]*>(.+?)</option>',
                        body[start:end]):
        d = data.copy()
        # ret=gs.look_up(m[0],1)
        d['country'] = m[1].strip()
        d['country_code'] = m[0]
        country_list.append(d)
Exemplo n.º 30
0
def fetch_cities(data):
    """POST the country selection and parse the resulting city <select>.

    NOTE(review): no `return city_list` -- snippet appears truncated.
    """
    url = data['url']
    try:
        body = cm.post_data(
            url, {
                'searchtype': 'normal',
                'reiter_selected': 'reiter1',
                'country_id': data['country_code'],
                'city_id': 0
            })
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # Options between the "city" placeholder and </select> are the cities.
    m = re.search(ur'<option value="0"[^>]*>city</option>', body)
    if m is None:
        return []
    end = body.find(u'</select>', m.end())

    city_list = []
    for c in re.findall(ur'<option value="(.+?)"[^>]*>.+?</option>',
                        body[m.end():end]):
        d = data.copy()
        d['city'] = c
        city_list.append(d)
Exemplo n.º 31
0
def fetch_stores(data):
    """Collect store teasers (<h2 property="dc:title"> blocks) from the page.

    :param data: context dict with cm.url, 'host' and 'brand_id'.
    :return: list of copies of ``data`` carrying the detail URL and the
        store name; [] on download failure.
    """
    url = data[cm.url]
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<h2 property="dc:title"', html):
        # Each store block runs from the <h2> marker to the closing </header>.
        end = html.find('</header>', m.start())
        if end == -1:
            continue
        sub = html[m.start():end]
        m1 = re.search(ur'<a href="(.+?)">(.+?)</a></h2>', sub)
        if m1 is None:
            print 'Error: no more details for %s' % url
            continue
        d = data.copy()
        d[cm.url] = data['host'] + m1.group(1)
        d[cm.name_e] = cm.html2plain(m1.group(2)).strip()
        store_list.append(d)
    # BUG FIX: the accumulated list was never returned (caller got None).
    return store_list
Exemplo n.º 32
0
def fetch_continents(data):
    """Fetch the continent list from the store-locator <select> element.

    :param data: context dict with 'store_url' and 'brand_id'.
    :return: list of copies of ``data`` with 'continent' set; [] on failure.
    """
    url = data['store_url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    start = html.find(u'<select id="continent" name="continent"')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<select\b',
                                         ur'</select')

    continent_list = []
    for m in re.findall(ur'<option value="(.+?)">.+?</option>', sub):
        d = data.copy()
        d['continent'] = m
        continent_list.append(d)
    # BUG FIX: the accumulated list was never returned (caller got None).
    return continent_list
Exemplo n.º 33
0
def get_cities(data):
    """Fetch the city list for one country from the store locator.

    Relies on module-level ``url``, ``brand_id`` and ``common``. A handful
    of known-bad hard-coded entries are filtered out.

    :param data: dict with 'country_code' and 'country_e'.
    :return: list of {'city_e', 'country_e', 'country_code'} dicts; [] on failure.
    """
    try:
        d = {'country': data['country_code'], 'city': '', 'service': -1}
        html = common.post_data(url, d)
    except Exception:
        print 'Error occured in getting the list of countries: %s' % url
        dump_data = {'level': 1, 'time': common.format_time(), 'data': {'data': url}, 'brand_id': brand_id}
        common.dump(dump_data)
        return []

    start = html.find(u'<select id="city" name="city">')
    if start == -1:
        return []
    end = html.find(u'</select>', start)
    html = html[start:end]
    city_list = []
    for m in re.findall(ur'<option value="(.+?)">', html):
        # Skip a few hard-coded entries that should not appear in the output.
        if data['country_code'] == 'GB' and '2 davies street' in m.lower():
            continue
        elif data['country_code'] == 'RO' and '13 september street' in m.lower():
            continue
        elif 'b1603daq' in m.lower():
            continue
        else:
            city_list.append({'city_e': m, 'country_e': data['country_e'], 'country_code': data['country_code']})
    # BUG FIX: the accumulated list was never returned (caller got None).
    return city_list
Exemplo n.º 34
0
def fetch_stores(data):
    """Parse store markers embedded in the page's JavaScript.

    Each store is a ``markerContent`` string paired with a
    ``google.maps.LatLng`` coordinate inside a ``createMarker`` call.

    :param data: context dict with 'url' and brand fields.
    :return: list of store entries (also inserted into the DB); [] on failure.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.findall(ur'var markerContent\s*?=\s*?"(.+?)".+?'
                        ur'createMarker\(.+?new google.maps.LatLng\((-?\d+\.\d+),(-?\d+\.\d+)\)', html, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        lat, lng = map(string.atof, [m[1], m[2]])
        cm.update_entry(entry, {cm.lat: lat, cm.lng: lng})

        sub = m[0].strip()
        # The first <b>...</b> in the marker HTML holds the store name.
        m1 = re.search(ur'<b>(.+?)</b>', sub)
        if m1 is None:
            continue
        entry[cm.name_c] = m1.group(1)
        sub = sub.replace(m1.group(0), '')
        # Phone number follows the Chinese "contact phone" label.
        m1 = re.search(ur'聯系電話(?::|:)(.+?)<', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1)
            sub = sub.replace(m1.group(0), '<')
        sub = re.sub(ur'<img\b.*?/>', '', sub)
        entry[cm.addr_c] = cm.reformat_addr(sub)

        print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_c], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
    # BUG FIX: the accumulated list was never returned (caller got None).
    return store_list
Exemplo n.º 35
0
def fetch_cities(data):
    """Fetch the city/county list for a country via the esiajax proxy.

    :param data: context dict with 'host', 'country_code' and 'brand_id'.
    :return: list of copies of ``data`` with 'city' set; [] on failure.
    """
    url = data['host'] + '/ajax/esiajaxProxy.asp'
    try:
        body = cm.get_data(
            url, {
                'c': 'FF_StoreLocator2',
                'm': 'getCountiesAjax',
                'ws': 'ch-ch',
                'pid': 178,
                'cid': data['country_code'],
                'CT': 0
            })
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    results = []
    for m in re.findall(ur'<li><a href="" data-value="(.+?)">', body):
        d = data.copy()
        d['city'] = m
        results.append(d)
    # BUG FIX: the accumulated list was never returned (caller got None).
    return results
Exemplo n.º 36
0
def fetch_store_list(data):
    """POST a country/city search and collect per-store detail URLs.

    :param data: context dict with 'url', 'host', 'country_code', 'city'.
    :return: list of copies of ``data`` whose 'url' points at the store
        detail page; [] on failure.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {
            'cCode': data['country_code'],
            'city': data['city'],
            'postsearch': 1
        })
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    results = []
    for m in re.finditer(ur'<td class\s*=\s*"ftd"', body):
        # One table row per store; the "ltd" cell links to the detail page.
        end = body.find('</tr>', m.start())
        sub = body[m.start():end]
        m1 = re.search(ur'<td class="ltd"><a href="(.+?)">', sub)
        if m1 is None:
            print 'Cannot find details: %s / %s' % (data['country_code'],
                                                    data['city'])
        else:
            d = data.copy()
            d['url'] = data['host'] + m1.group(1)
            results.append(d)
    # BUG FIX: the accumulated list was never returned (caller got None).
    return results
Exemplo n.º 37
0
Arquivo: sushi.py Projeto: tp7/Sushi
def groups_from_chapters(events, times):
    """Split *events* into groups delimited by chapter start *times*.

    Groups whose events are all linked (e.g. a chapter containing only
    comments) are dissolved: each event is moved into the group holding
    the end of its link chain.
    """
    logging.info(u'Chapter start points: {0}'.format([format_time(t) for t in times]))
    # A sentinel chapter far beyond any real timestamp closes the last group.
    boundaries = iter(times[1:] + [36000000000])
    boundary = next(boundaries)

    groups = [[]]
    for event in events:
        if event.end > boundary:
            groups.append([])
            while event.end > boundary:
                boundary = next(boundaries)
        groups[-1].append(event)

    groups = filter(None, groups)  # drop empty groups

    # A group where every event is linked cannot stand on its own -
    # reattach each of its events to the group of its link-chain parent.
    dead_groups = [g for g in groups if all(e.linked for e in g)]
    if dead_groups:
        for g in dead_groups:
            for event in g:
                parent = event.get_link_chain_end()
                target = next(x for x in groups if parent in x)
                target.append(event)
            del g[:]
        groups = filter(None, groups)
        # Reattaching may break ordering; re-sorting mostly-sorted lists is cheap.
        for g in groups:
            g.sort(key=lambda e: e.start)

    return groups
Exemplo n.º 38
0
def fetch_cities(data):
    url = data['home_url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, 'canali_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = body.find(u'<nav class="countrySelector">')
    if start == -1:
        cm.dump('Error occured in fetching country list: %s' % url, 'canali_log.txt')
    body = cm.extract_closure(body[start:], ur'<nav\b', ur'</nav>')[0]

    results = []
    for m in re.finditer(ur'<li><a href=".+?">(.+?)</a>', body):
        country = m.group(1).strip().upper()
        sub = cm.extract_closure(body[m.end():], ur'<ul\b', ur'</ul>')[0]
        for m1 in re.findall(ur'<li><a class=".+?" href="(.+?)">(.+?)</a></li>', sub):
            d = data.copy()
            d['country'] = country
            d['url'] = data['host'] + m1[0]
            d['city'] = m1[1].strip().upper()
            results.append(d)
Exemplo n.º 39
0
def get_frag_countries(url):
    """Fetch the fragrance country list (numeric id and name).

    :rtype : [{'id':**, 'country':**}, ...]
    :param url: page containing the country <select> element.
    :return: list of dicts on success; ([], False) on failure.
        NOTE(review): the success path returns a bare list while failure
        returns a 2-tuple - confirm callers expect this asymmetry.
    """
    try:
        html = common.get_data(url)
    except Exception:
        print 'Error occured: %s' % url_fragrance
        common.dump({'level': 1,
                     'time': common.format_time(),
                     'data': {'url': url_fragrance},
                     'brand_id': brand_id})
        return [], False

    anchor = html.find('<select name="country" id="id_country">')
    if anchor == -1:
        return [], False
    sub, _, closure_end = common.extract_closure(html[anchor:], ur'<select\b',
                                                 ur'</select>')
    if closure_end == 0:
        return [], False

    countries = []
    for code, name in re.findall(ur'<option value="(\d+)".*?>(.+?)</option>', sub):
        countries.append({'id': string.atoi(code), 'country': name.strip().upper()})
    return countries
Exemplo n.º 40
0
def get_frag_countries(url):
    """Fetch the fragrance country list (numeric id and name).

    :rtype : [{'id':**, 'country':**}, ...]
    :param url: page containing the country <select> element.
    :return: list of dicts on success; ([], False) on failure (asymmetric
        shapes preserved as found).
    """
    try:
        html = common.get_data(url)
    except Exception:
        print 'Error occured: %s' % url_fragrance
        dump_data = {'level': 1,
                     'time': common.format_time(),
                     'data': {'url': url_fragrance},
                     'brand_id': brand_id}
        common.dump(dump_data)
        return [], False

    pos = html.find('<select name="country" id="id_country">')
    if pos == -1:
        return [], False
    sub, _, end = common.extract_closure(html[pos:], ur'<select\b', ur'</select>')
    if end == 0:
        return [], False

    pairs = re.findall(ur'<option value="(\d+)".*?>(.+?)</option>', sub)
    return [{'id': string.atoi(value), 'country': label.strip().upper()}
            for value, label in pairs]
Exemplo n.º 41
0
def fetch_cities(data):
    url = data['home_url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, 'unode50_log.txt')
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    m = re.search(ur'countries\s*=\s*\{', body)
    if m is None:
        cm.dump('Error in fetching cities: %s' % url, 'unode50_log.txt')
        return []
    body = cm.extract_closure(body[m.start():], ur'\{', ur'\}')[0]
    raw = json.loads(body)
    results = []
    for key in raw:
        d = data.copy()
        d['country'] = raw[key]['name'].strip().upper()
        d['country_id'] = key
        results.append(d)
    return results
Exemplo n.º 42
0
def get_store_details(data):
    """Scrape one store detail page into a store entry and insert it.

    Relies on module-level ``brand_id``, ``brandname_e``, ``brandname_c``.
    NOTE(review): returns [] on failure but a bare entry dict on success -
    confirm callers expect the asymmetric return shapes.

    :param data: dict with 'url', 'name', 'country', 'city'.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    entry[cm.name_e] = data['name']
    entry[cm.url] = data['url']
    start = html.find(ur'<div class="storelocator-breadcrumbs">')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>')
    if end == 0:
        return []
        # The last <li>...</li> of the breadcrumbs holds the address.
    m = re.findall(ur'<li>(.+?)</li>', sub, re.S)
    if len(m) > 0:
        entry[cm.addr_e] = cm.reformat_addr(m[-1])
        # Latitude / longitude.
    m = re.findall(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html)
    if len(m) > 0:
        cm.update_entry(entry, {cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1])})

    m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S)
    if m is not None:
        contact_sub = m.group(1)
        pat_tel = re.compile(ur'<p class="phone">(.+?)</p>')
        m1 = re.search(pat_tel, contact_sub)
        if m1:
            entry[cm.tel] = cm.extract_tel(m1.group(1))
            contact_sub = re.sub(pat_tel, '', contact_sub)
        # Whatever remains of the contact block is treated as opening hours.
        hours_list=[tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',')]
        if 'opening hours' in hours_list[0].lower():
            del hours_list[0]
        entry[cm.hours] = ', '.join(hours_list)

    # Geo
    country = data['country']
    city = data['city']
    cm.update_entry(entry, {cm.country_e: country, cm.city_e: city})
    gs.field_sense(entry)
    # Fill province/city from the address only when still empty.
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
        entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return entry
Exemplo n.º 43
0
def fetch_stores(data):
    """
    Fetch store information (retail stores or service centers).
    :param data: dict with "post_url", "country_id", "retail_type" and brand fields.
    :return: list of store entries (also inserted into the DB); [] on failure.
    """
    url = data["post_url"]
    try:

        js = json.loads(
            cm.post_data(
                url, {"country_id": data["country_id"], "retail_city": "", "retail_type": data["retail_type"]}
            ).decode("unicode_escape")
        )
    except Exception:
        print "Error occured in getting country list: %s" % url
        dump_data = {"level": 1, "time": cm.format_time(), "data": {"url": url}, "brand_id": data["brand_id"]}
        cm.dump(dump_data)
        return []

    # country_id=108&retail_city=&retail_type=retail
    # country_id=99&retail_city=&retail_type=service
    store_list = []
    for s in js:
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])
        # Chinese names go into name_c, everything else into name_e.
        tmp = s["retail_name"].strip()
        if cm.is_chinese(tmp):
            entry[cm.name_c] = tmp
        else:
            entry[cm.name_e] = tmp
        entry[cm.addr_e] = s["retail_gmap"].strip()
        entry[cm.zip_code] = s["retail_zipcode"].strip()
        entry[cm.city_e] = s["retail_city"].strip().upper()
        if s["retail_email"] is not None:
            entry[cm.email] = s["retail_email"].strip()
        if s["retail_website"] is not None:
            entry[cm.url] = s["retail_website"].strip()
        if data["retail_type"] == "retail":
            entry[cm.store_class] = "Retail"
        else:
            entry[cm.store_class] = "Service Center"
        entry[cm.country_e] = s["country_name"].strip().upper()
        entry[cm.continent_e] = s["continent_name"].strip().upper()

        gs.field_sense(entry)
        print "(%s / %d) Found store: %s, %s (%s, %s)" % (
            data["brandname_e"],
            data["brand_id"],
            entry[cm.name_e],
            entry[cm.addr_e],
            entry[cm.country_e],
            entry[cm.continent_e],
        )
        store_list.append(entry)
        db.insert_record(entry, "stores")

    return store_list
Exemplo n.º 44
0
def fetch_uk_home(data):
    """Download the French Connection UK store-locator page.

    Returns [] on download failure. NOTE(review): on success the function
    falls through and implicitly returns None - the parsing of ``body``
    that presumably followed appears truncated; confirm against the
    original source before relying on this.
    """
    url = 'http://www.frenchconnection.com/content/stores/united+kingdom.htm'
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []
Exemplo n.º 45
0
def fetch_stores(data):
    """Fetch detailed store entries from a store-list page.

    :rtype : [entries]
    :param data: context dict with 'url' plus continent/country/city fields.
    :return: list of store entries (also inserted into the DB); [] on failure.
    """
    # BUG FIX: bind url before the try block - the except branch referenced
    # an undefined name and raised NameError instead of logging the failure.
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    entries = []
    start = html.find(u'<ul class="store-list">')
    if start == -1:
        return entries
    start += len(u'<ul class="store-list">')
    end = html.find(u'</ul>', start)
    html = html[start:end]

    for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S):
        store = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        store[cm.store_type] = m1[0]
        sub_html = m1[1]
        m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html)
        if len(m2) > 0:
            store[cm.name_e] = cm.reformat_addr(m2[0])
        m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S)
        if len(m2) > 0:
            store[cm.addr_e] = cm.reformat_addr(m2[0])

        cm.update_entry(store, {cm.continent_e: data[cm.continent_e].strip().upper(),
                                cm.country_e: data[cm.country_e].strip().upper(),
                                cm.city_e: data[cm.city_e].strip().upper()})

        entry = store
        # Fill country/province/city from the address only when still empty.
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, store[cm.name_e], store[cm.addr_e], store[cm.country_e],
            store[cm.continent_e])
        db.insert_record(store, 'stores')
        entries.append(store)
    # BUG FIX: the accumulated entries were never returned (caller got None).
    return entries
Exemplo n.º 46
0
Arquivo: sushi.py Projeto: tp7/Sushi
def snap_groups_to_keyframes(events, chapter_times, max_ts_duration, max_ts_distance, src_keytimes, dst_keytimes,
                             src_timecodes, dst_timecodes, max_kf_distance, kf_mode):
    """Align event groups to video keyframes.

    Two passes, selected by *kf_mode* ('shift', 'snap' or 'all'):
    step 1 shifts whole groups without changing durations; step 2 snaps
    the start/end of each group separately. No-op when *max_kf_distance*
    is falsy. Mutates the events in place; returns None.
    """
    if not max_kf_distance:
        return

    groups = merge_short_lines_into_groups(events, chapter_times, max_ts_duration, max_ts_distance)

    if kf_mode == 'all' or kf_mode == 'shift':
        #  step 1: snap events without changing their duration. Useful for some slight audio imprecision correction
        shifts = []
        times = []
        for group in groups:
            shifts.extend(find_keyframe_shift(group, src_keytimes, dst_keytimes, src_timecodes, dst_timecodes, max_kf_distance))
            times.extend((group[0].shifted_start, group[-1].shifted_end))

        shifts = interpolate_nones(shifts, times)
        if shifts:
            mean_shift = np.mean(shifts)
            # Regroup the flat [s0, e0, s1, e1, ...] list into (start, end) pairs.
            shifts = zip(*(iter(shifts), ) * 2)

            logging.info('Group {0}-{1} corrected by {2}'.format(format_time(events[0].start), format_time(events[-1].end), mean_shift))
            for group, (start_shift, end_shift) in izip(groups, shifts):
                if abs(start_shift-end_shift) > 0.001 and len(group) > 1:
                    # Disagreeing endpoints on a multi-event (typesetting) group:
                    # pick the endpoint shift closest to the overall mean.
                    actual_shift = min(start_shift, end_shift, key=lambda x: abs(x - mean_shift))
                    logging.warning("Typesetting group at {0} had different shift at start/end points ({1} and {2}). Shifting by {3}."
                                    .format(format_time(group[0].start), start_shift, end_shift, actual_shift))
                    for e in group:
                        e.adjust_shift(actual_shift)
                else:
                    for e in group:
                        e.adjust_additional_shifts(start_shift, end_shift)

    if kf_mode == 'all' or kf_mode == 'snap':
        # step 2: snap start/end times separately
        for group in groups:
            if len(group) > 1:
                # NOTE(review): this 'pass' does not skip the group - the
                # snapping below still runs for multi-event groups; the comment
                # suggests 'continue' was intended. Confirm against upstream.
                pass  # we don't snap typesetting
            start_shift, end_shift = find_keyframes_distances(group[0], src_keytimes, dst_keytimes, src_timecodes, max_kf_distance)
            if abs(start_shift) > 0.01 or abs(end_shift) > 0.01:
                logging.info('Snapping {0} to keyframes, start time by {1}, end: {2}'.format(format_time(group[0].start), start_shift, end_shift))
                group[0].adjust_additional_shifts(start_shift, end_shift)
Exemplo n.º 47
0
def fetch_stores(data):
    """Fetch store information for one country.

    :param data: context dict with 'url', 'key_term', 'country_e' and brand fields.
    :return: list of store entries (also inserted into the DB); [] on failure.
    """
    url = data['url']
    try:
        info = json.loads(cm.get_data(url, {'tskay': data['key_term']}))
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    raw_list = info['shops']
    store_list = []
    for s in raw_list:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.city_e] = s['city'].strip().upper()
        entry[cm.country_e] = data['country_e'].strip().upper()
        entry[cm.name_e] = s['name'].strip()
        addr = s['address']
        entry[cm.addr_e] = addr

        # Try to pull a postal code out of the last address component when
        # it mentions the city.
        terms = addr.split(',')
        if len(terms) > 1 and entry[cm.city_e] in terms[-1].strip().upper():
            country = entry['country_e']
            tmp = gs.look_up(country, 1)
            if tmp is not None:
                country = tmp['name_e']
            if country == 'JAPAN':
                # Japanese postal codes: digit groups joined by space/dash/dot.
                m = re.search(ur'\d{3,}[ -\.]+?\d{3,}', terms[-1])
                if m is not None:
                    entry[cm.zip_code] = m.group(0)
            else:
                m = re.search(ur'\d{4,}', terms[-1])
                if m is not None:
                    entry[cm.zip_code] = m.group(0)

        entry[cm.tel] = s['tel']
        entry[cm.fax] = s['fax']
        entry[cm.email] = s['email']
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                          entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                          entry[cm.continent_e])
        # BUG FIX: entry was appended twice per store, duplicating every
        # record in the returned list; append exactly once.
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    return store_list
Exemplo n.º 48
0
def get_stores(data):
    """POST the store-locator search for one country.

    Relies on module-level ``cm`` and ``brand_id``. Returns [] on failure.
    NOTE(review): the success path is missing here - the block appears
    truncated right after the except clause; confirm against the original.

    :param data: dict with 'url' and 'country_code'.
    """
    # data[StoreLocator][pays]=BO
    url = data['url']
    try:
        html = cm.post_data(url, {'data[StoreLocator][pays]': data['country_code'],
                                  'data[StoreLocator][ville]': '',
                                  'data[StoreLocator][etat]': 0})
    except Exception, e:
        print 'Error occured: %s, %s' % (url, str(e))
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []
Exemplo n.º 49
0
def fetch_stores(data):
    """Fetch stores for one city via the paulandjoe shop-search endpoint.

    :param data: context dict with 'post_shops', city/country fields and
        brand info.
    :return: list of store entries (also inserted into the DB); [] on
        download failure.
    """
    url = data['post_shops']
    param = {'city': data['city_e'], 'paulandjoe_women': 0, 'paulandjoe_man': 0,
             'paulandjoe_sister': 0, 'paulandjoe_little': 0, 'paulandjoe_beauty': 0}
    try:
        html = cm.post_data(url, param)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    try:
        for store in (pq(tmp) for tmp in pq(html)('ul')):
            try:
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                entry[cm.name_e] = cm.html2plain(store('li.first')[0].text).strip()
                entry[cm.country_e] = data[cm.country_e]
                entry[cm.city_e] = data[cm.city_e]

                # The remaining <li> items are address lines; the last one may
                # actually be a phone number.
                addr_list = []
                for term in (cm.reformat_addr(unicode(pq(tmp))) for tmp in store('li[class!="first"]')):
                    if term != '':
                        addr_list.append(term)
                tel = cm.extract_tel(addr_list[-1])
                if tel != '':
                    entry[cm.tel] = tel
                    del addr_list[-1]
                entry[cm.addr_e] = ', '.join(addr_list)

                # Fill country/province/city from the address only when empty.
                gs.field_sense(entry)
                ret = gs.addr_sense(entry[cm.addr_e])
                if ret[0] is not None and entry[cm.country_e] == '':
                    entry[cm.country_e] = ret[0]
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)
                print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e])
                store_list.append(entry)
                db.insert_record(entry, 'stores')
            except (IndexError, TypeError) as e:
                cm.dump(u'Error in parsing %s, %s' % (url, param), log_name)
                print traceback.format_exc()
                continue
    except Exception, e:
        print traceback.format_exc()
    # BUG FIX: the accumulated store list was never returned (caller got None).
    return store_list
Exemplo n.º 50
0
def fetch_stores(data):
    """Download the store page and dispatch to the regional parser.

    A handful of countries share the UK-style markup; everything else is
    parsed with the generic world parser.
    """
    url = data["url"]
    try:
        body = cm.get_data(url)
    except Exception:
        print "Error occured: %s" % url
        cm.dump({"level": 0, "time": cm.format_time(), "data": {"url": url},
                 "brand_id": data["brand_id"]})
        return []

    if data["name"] in ("UK", "US", "JAPAN", "AUSTRALIA"):
        return fetch_uk(body, data)
    return fetch_world(body, data)
Exemplo n.º 51
0
def get_coordinates(url):
    """Scrape the first google.maps.LatLng pair from *url*.

    :return: [lat, lng] as floats when found, ['', ''] when the page has
        no coordinates, [] on download failure.
    """
    try:
        html = common.get_data(url)
    except Exception:
        print 'Error occured in retrieving the coordinates: %s' % url
        common.dump({'level': 2, 'time': common.format_time(),
                     'data': {'data': url}, 'brand_id': brand_id})
        return []

    matches = re.findall(ur'new google.maps.LatLng\(\s*?(-?\d+\.\d+)\s*?,\s*?(-?\d+\.\d+)\s*?\)', html)
    if not matches:
        return ['', '']
    lat, lng = matches[0]
    return [string.atof(lat), string.atof(lng)]
Exemplo n.º 52
0
def fetch_indv(data):
    """Fetch an individual listing page and index its <h2> section headers.

    Builds ``tmp`` as [{'idx1': start, 'idx2': end, 'name': header}, ...].
    NOTE(review): the function ends without using or returning ``tmp`` -
    the remainder appears truncated; confirm against the original source.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # Restrict parsing to the first <article> element.
    body, start, end = cm.extract_closure(body, ur'<article\b', ur'</article>')
    tmp = []
    for m in re.finditer(ur'<h2>\s*(.+?)\s*</h2>', body):
        tmp.append({'idx1': m.start(), 'idx2': m.end(), 'name': m.group(1).strip().upper()})
Exemplo n.º 53
0
def fetch_store_list(url):
    """
    Fetch the store list.
    :rtype : store list in the form [{'name':**, 'lat':**, 'lng':**, 'type':**, 'url':**}]
    :param url: store-locator page URL
    """
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'data': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    # Start parsing: the data section sits between 'var items' and the
    # next 'var ' declaration.
    start = html.find('var items')
    if start == -1:
        # NOTE(review): returns {} here but [] on other failure paths -
        # confirm callers handle both shapes.
        return {}
    start += len('var items')
    end = html.find('var ', start)
    html = html[start:end]
    stores = []

    # Each store record is a JS array literal: [...].
    pattern = ur'\[(.+?)\]'
    store_list = []
    for m in re.findall(pattern, html, re.S):
        store_entry = {}
        m_list = re.findall(ur"'(.*)'", m)
        try:
            store_entry['name'] = cm.html2plain(m_list[0].strip())
            store_entry['type'] = m_list[2].strip()
            store_entry['url'] = m_list[4].strip()
        except IndexError:
            print 'Index error: %s' % m
            # Strip the quoted content, then look for the coordinate pair.
        m_list = re.findall(ur'(-?\d+\.\d+)', re.subn(ur"'(.*)'", '', m)[0])
        try:
            lat = string.atof(m_list[0])
            lng = string.atof(m_list[1])
            store_entry['lat'] = lat
            store_entry['lng'] = lng
        except (IndexError, ValueError):
            print 'Index error in getting coordinates: %s' % m

        # test
        # if 'hong-kong' in store_entry['url'] or 'taichung' in store_entry['url']:
        if len(store_entry.keys()) > 0:
            store_list.append(store_entry)
    return store_list
Exemplo n.º 54
0
def fetch_details(data):
    """Parse one store detail page into a store entry and insert it.

    :param data: context dict with cm.url, cm.name_e, cm.country_e,
        cm.continent_e and brand fields.
    :return: [entry] on success, [] on failure.
    """
    url = data[cm.url]
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.name_e] = data[cm.name_e]
    start = html.find(ur'<div class="field-address">')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<div\b', ur'</div>')
    if end == 0:
        return []
    m1 = re.search(ur'<div  class="locality">(.+?)</div>', sub)
    if m1 is not None:
        entry[cm.city_e] = cm.extract_city(m1.group(1))[0]
    m1 = re.search(ur'<div  class="postal-code">(.+?)</div>', sub)
    if m1 is not None:
        entry[cm.zip_code] = m1.group(1).strip()
    entry[cm.country_e] = data[cm.country_e]
    # Strip all tags, turning the address block into newline-separated text.
    pat = re.compile(ur'<[^<>]+?>', re.S)
    entry[cm.addr_e] = cm.reformat_addr(re.sub(pat, u'\r\n', sub))

    m1 = re.search(ur'<div class="field-telephone"><a href=".+?" class="tel">(.+?)</a></div>', html)
    if m1 is not None:
        entry[cm.tel] = m1.group(1).strip()

    m1 = re.search(ur'<div class="field-opening-hours">\s*<p>(.+?)</p>\s*</div>', html, re.S)
    if m1 is not None:
        entry[cm.hours] = cm.reformat_addr(m1.group(1))

    # Coordinates are accepted only with at least 4 decimal places.
    m1 = re.search(ur'"coordinates":\[(-?\d+\.\d{4,})\s*,\s*(-?\d+\.\d{4,})\]', html)
    if m1 is not None:
        lat = string.atof(m1.group(1))
        lng = string.atof(m1.group(2))
        cm.update_entry(entry, {cm.lat: lat, cm.lng: lng})

    entry[cm.continent_e] = data[cm.continent_e]
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                      entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                      entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
Exemplo n.º 55
0
def fetch_stores(data):
    """Fetch every shop returned by the brand's store-list endpoint.

    Queries data['store_url'] with the given country/city pair, parses the
    JSON response, writes each resulting entry to the 'stores' table and
    returns the list of entries. Returns an empty list on any fetch error.
    """
    url = data['store_url']
    try:
        body = cm.get_data(url, {'country': data['country'], 'city': data['city']})
    except Exception:
        # Best-effort scraper: record the failure and move on with no results.
        print('Error occured: %s' % url)
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for item in json.loads(body)['items']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country'].strip().upper()
        region = cm.extract_city(data['city'])[0]
        # NOTE(review): for USA the scraped "city" value is filed under
        # province instead — presumably it holds a state name; confirm
        # against the site's data.
        if entry[cm.country_e] == 'USA':
            entry[cm.province_e] = region
        else:
            entry[cm.city_e] = region
        gs.field_sense(entry)

        # The address field arrives with literal backslashes; the trailing
        # comma-separated component may be a phone number, which is split
        # off into its own field.
        addr = cm.reformat_addr(item['address'].replace(u'\\', ''))
        parts = [piece.strip() for piece in addr.split(',')]
        tel = cm.extract_tel(parts[-1])
        if tel != '':
            entry[cm.tel] = tel
            parts.pop()
        entry[cm.addr_e] = ', '.join(parts)
        entry[cm.store_type] = item['shop_type']

        gs.field_sense(entry)
        # Let the address heuristics fill in any still-empty region fields.
        inferred = gs.addr_sense(entry[cm.addr_e])
        for idx, field in enumerate((cm.country_e, cm.province_e, cm.city_e)):
            if inferred[idx] is not None and entry[field] == '':
                entry[field] = inferred[idx]
        gs.field_sense(entry)

        print('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                          entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                          entry[cm.continent_e]))
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return store_list
Exemplo n.º 56
0
def fetch_countries(data):
    """Fetch the list of countries offered on the brand's home page.

    Downloads data['home_url'], scrapes the country <option> entries and
    returns one copy of *data* per country with 'country_e' filled in.
    Returns an empty list when the page cannot be fetched.
    """
    url = data['home_url']
    try:
        html = cm.get_data(url)
    except Exception:
        # Best-effort scraper: record the failure and move on with no results.
        print('Error occured: %s' % url)
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    country_list = []
    # Pattern value is identical to the original ur'' literal, spelled in a
    # form that parses under both Python 2 and 3.
    for m in re.findall(u'<option value="[\\w ]+?">(.+?)</option>', html):
        d = data.copy()
        d['country_e'] = m
        country_list.append(d)
    # BUG FIX: the original fell off the end and implicitly returned None;
    # the error path returns [] and callers iterate the result, so the
    # collected list must be returned here.
    return country_list
Exemplo n.º 57
0
def fetch_stores(data):
    """Fetch the store-detail links listed on data['url'].

    Scrapes each <li class="store"> anchor from the page and returns one
    copy of *data* per store with 'url' replaced by that store's
    detail-page URL. Returns an empty list when the page cannot be fetched.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        # Best-effort scraper: record the failure and move on with no results.
        print('Error occured: %s' % url)
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    result = []
    # Pattern value is identical to the original ur'' literal, spelled in a
    # form that parses under both Python 2 and 3.
    for m in re.findall(u'<li class="store">.+?<a href="(.+?)".+?</li>', body, re.S):
        d = data.copy()
        d['url'] = m.strip()
        result.append(d)
    # BUG FIX: the original never returned `result`, yielding None to callers;
    # the error path returns [], so the collected list must be returned here.
    return result
Exemplo n.º 58
0
def fetch_cities(data):
    url = data['data_url']
    try:
        body = cm.get_data(url, {'country_code': data['country'], 'toget': 'citylist'})
    except Exception:
        cm.dump('Error in fetching cities: %s, %s' % (url, data['country']), 'benetton_log.txt', False)
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    results = []
    for m in re.findall(ur'<option value=\\"(.+?)\\">', body):
        d = data.copy()
        d['city'] = m.strip().upper()
        results.append(d)