Пример #1
0
def parse_index(ex, type_, content, conf):
    """Scan an index page for announcements and store the unseen ones.

    The page bytes are decoded with ``conf['encoding']`` and searched with
    the ``conf['detail']`` regex (DOTALL).  The groups of every match are
    paired positionally with ``conf['fields']`` to form a record.  Records
    whose URL is already known to ``Announcement`` are skipped; for the
    others the detail page is downloaded and the record is upserted.

    Args:
        ex: Exchange object; ``ex.abbr`` and ``ex._id`` are read.
        type_: Value stored under the record's ``type_`` key.
        content: Raw bytes of the index page.
        conf: Mapping providing ``encoding``, ``detail`` and ``fields``,
            and optionally ``relative`` (prefix for non-``http`` URLs).
            The fields must produce ``url``, ``published_at`` and
            ``title``, all of which are read below.

    Returns:
        None.
    """
    page = content.decode(conf['encoding'], 'ignore')
    entry_re = re.compile(conf['detail'], re.DOTALL)

    for groups in entry_re.findall(page):
        record = {}
        for field, raw in zip(conf['fields'], groups):
            # Removes attribute-less tags (<b>, </span>, ...) and *all*
            # whitespace, including whitespace inside the value.
            record[field] = re.sub(r'(</?[a-zA-Z]+>|\s+)', '', raw.strip())

        if 'relative' in conf and not record['url'].startswith('http'):
            record['url'] = conf['relative'] + record['url']

        # Known URL: skip before paying for the detail-page download.
        if Announcement.query_one({'url': record['url']}):
            continue

        # Per-exchange fix-ups of the date text before it is parsed.
        if ex.abbr == '中港邮币卡':
            # Any tag left in the date becomes a '-' separator.
            record['published_at'] = re.sub(
                '<[^>]*>', '-', record['published_at'])
        elif ex.abbr == '三点零':
            stamp = re.sub('<[^>]*>', '', record['published_at'])
            # Move the first two characters behind the rest, joined by '/'.
            record['published_at'] = stamp[2:] + '/' + stamp[:2]

        # NOTE(review): the 8-hour shift presumably converts UTC+8 local
        # time to UTC -- confirm against how published_at is consumed.
        parsed = parse_datetime(record['published_at'])
        record['published_at'] = parsed - timedelta(hours=8)
        record['exchange'] = ex._id
        record['type_'] = type_

        detail_bytes = session.get(record['url'], timeout=(5, 10)).content
        detail_html = detail_bytes.decode(conf['encoding'], 'ignore')
        # Every occurrence of the encoding name is replaced; presumably this
        # targets the page's charset declaration -- verify.
        record['html'] = detail_html.replace(conf['encoding'], 'utf-8')

        log.info('[{exchange}]{published_at}: {title}'.format(**record))
        Announcement(record).upsert()