def parse_index(ex, type_, content, conf):
    """Parse an exchange's announcement index page and upsert new entries.

    Args:
        ex: exchange document; ``ex.abbr`` selects exchange-specific
            date fixups and ``ex._id`` is stored on each announcement.
        type_: announcement category stored in the ``type_`` field
            (e.g. 'offer' or 'result').
        content: raw bytes of the index page.
        conf: per-exchange config dict with keys ``encoding``, ``detail``
            (a regex with one capture group per field), ``fields`` (the
            field names matching those groups) and optionally ``relative``
            (base URL prepended to relative links).
    """
    text = content.decode(conf['encoding'], 'ignore')
    # Strips simple HTML tags and all whitespace from captured field values.
    clean = re.compile(r'(</?[a-zA-Z]+>|\s+)')
    for values in re.compile(conf['detail'], re.DOTALL).findall(text):
        d = {key: clean.sub('', value.strip())
             for key, value in zip(conf['fields'], values)}
        if 'relative' in conf and not d['url'].startswith('http'):
            d['url'] = conf['relative'] + d['url']
        # Skip announcements that are already stored.
        if Announcement.query_one({'url': d['url']}):
            continue
        if ex.abbr == '中港邮币卡':
            # Date parts are separated by markup on this site; turn the
            # tags into '-' so the string becomes parseable.
            d['published_at'] = re.sub('<[^>]*>', '-', d['published_at'])
        if ex.abbr == '三点零':
            # Drop tags, then move the leading two characters behind a '/'
            # (presumably month-first markup — TODO confirm against the site).
            pa = re.sub('<[^>]*>', '', d['published_at'])
            d['published_at'] = pa[2:] + '/' + pa[:2]
        # Site timestamps are UTC+8; store as UTC.
        d['published_at'] = parse_datetime(d['published_at']) \
            - timedelta(hours=8)
        d['exchange'] = ex._id
        d['type_'] = type_
        # Fetch the detail page; previously this reused (shadowed) the
        # `content` parameter, which was confusing.
        detail = session.get(d['url'], timeout=(5, 10)).content
        d['html'] = detail.decode(conf['encoding'], 'ignore')
        d['html'] = d['html'].replace(conf['encoding'], 'utf-8')
        log.info('[{exchange}]{published_at}: {title}'.format(**d))
        Announcement(d).upsert()
def parse():
    """Admin view: show parsing progress and, when ``url`` is given, the
    matching announcement plus its extracted collections for review.

    NOTE(review): ``render_template(..., **locals())`` passes every local
    name to the template, so the local variable names in this function are
    part of the template contract — do not rename them casually.
    """
    nav = 'parse'
    url = request.args.get('url')
    # Progress counters over offer/result announcements.
    num_parsed = Announcement.count({
        'parsed': True,
        'type_': {
            '$in': ['offer', 'result']
        }
    })
    num_total = Announcement.count({'type_': {'$in': ['offer', 'result']}})
    if url:
        announcement = Announcement.query_one({'url': url})
        colls = list(Collection.query({'from_url': url}))
        for coll in colls:
            # Pre-format fields for display in the template.
            if coll.offers_at:
                coll.offers_at2 = coll.offers_at.strftime('%Y%m%d')
            if coll.offer_cash_ratio:
                # Fraction -> whole-percent string, e.g. 0.5 -> '50%'.
                coll.offer_cash_ratio = '{:2.0f}%'.format(
                    coll.offer_cash_ratio * 100)
            if coll.offer_price:
                coll.offer_price = str(coll.offer_price)
                if coll.offer_price.endswith('.0'):
                    # Show integral prices without the trailing '.0'.
                    coll.offer_price = coll.offer_price[:-2]
    all_done = num_parsed == num_total
    return render_template('admin/parse.html', **locals())
def parse_index(ex, type_, content, conf):
    """Scan an exchange index page for announcements and store new ones.

    ``conf`` supplies the page ``encoding``, the ``detail`` regex (one
    capture group per entry in ``fields``) and an optional ``relative``
    URL prefix; ``ex`` and ``type_`` tag each stored record.
    """
    page = content.decode(conf['encoding'], 'ignore')
    entry_re = re.compile(conf['detail'], re.DOTALL)
    tag_or_space = re.compile(r'(</?[a-zA-Z]+>|\s+)')
    for groups in entry_re.findall(page):
        record = {}
        for field, raw in zip(conf['fields'], groups):
            record[field] = tag_or_space.sub('', raw.strip())
        if 'relative' in conf and not record['url'].startswith('http'):
            record['url'] = conf['relative'] + record['url']
        if Announcement.query_one({'url': record['url']}):
            continue  # already stored
        if ex.abbr == '中港邮币卡':
            # Markup separates the date parts on this site; use '-'.
            record['published_at'] = re.sub('<[^>]*>', '-',
                                            record['published_at'])
        if ex.abbr == '三点零':
            # Strip tags, then rotate the first two characters to the end.
            stripped = re.sub('<[^>]*>', '', record['published_at'])
            record['published_at'] = stripped[2:] + '/' + stripped[:2]
        # Timestamps on the site are UTC+8; normalize to UTC.
        record['published_at'] = (parse_datetime(record['published_at'])
                                  - timedelta(hours=8))
        record['exchange'] = ex._id
        record['type_'] = type_
        content = session.get(record['url'], timeout=(5, 10)).content
        record['html'] = content.decode(conf['encoding'], 'ignore')
        record['html'] = record['html'].replace(conf['encoding'], 'utf-8')
        log.info('[{exchange}]{published_at}: {title}'.format(**record))
        Announcement(record).upsert()
def parse_findone():
    """Redirect to parsing the newest unparsed offer/result announcement.

    Falls back to the bare parse view when nothing is left to parse.
    """
    query = {"parsed": False, "type_": {"$in": ["offer", "result"]}}
    announcement = Announcement.query_one(query, sort=[("published_at", -1)])
    if not announcement:
        return redirect(url_for("admin.parse"))
    return redirect(url_for("admin.parse", url=announcement.url))
def parse_findone():
    """Jump to the parse view for the most recently published unparsed
    offer/result announcement, or to the plain parse view if none exist."""
    pending = Announcement.query_one(
        {'parsed': False, 'type_': {'$in': ['offer', 'result']}},
        sort=[('published_at', -1)])
    target = (url_for('admin.parse', url=pending.url)
              if pending else url_for('admin.parse'))
    return redirect(target)
def parse():
    """Admin view: report parsing progress; with a ``url`` query arg, also
    load that announcement and its collections, pre-formatted for display.

    NOTE(review): ``render_template(..., **locals())`` exposes every local
    variable to the template, so local names here are part of the template
    contract — renaming any of them would change the rendered context.
    """
    nav = "parse"
    url = request.args.get("url")
    # Progress counters over offer/result announcements.
    num_parsed = Announcement.count({"parsed": True, "type_": {"$in": ["offer", "result"]}})
    num_total = Announcement.count({"type_": {"$in": ["offer", "result"]}})
    if url:
        announcement = Announcement.query_one({"url": url})
        colls = list(Collection.query({"from_url": url}))
        for coll in colls:
            # Pre-format fields for the template.
            if coll.offers_at:
                coll.offers_at2 = coll.offers_at.strftime("%Y%m%d")
            if coll.offer_cash_ratio:
                # Fraction -> whole-percent string, e.g. 0.5 -> '50%'.
                coll.offer_cash_ratio = "{:2.0f}%".format(coll.offer_cash_ratio * 100)
            if coll.offer_price:
                coll.offer_price = str(coll.offer_price)
                if coll.offer_price.endswith(".0"):
                    # Display integral prices without the trailing '.0'.
                    coll.offer_price = coll.offer_price[:-2]
    all_done = num_parsed == num_total
    return render_template("admin/parse.html", **locals())