def aggregate_shops(start, end, date=None):
    """Aggregate every shop whose partition token lies in ``[start, end)``.

    Rows are selected from ``ataobao2.shop`` and passed one at a time to
    ``aggregate_shop`` together with a ``ShopIndex`` and a ``CategoryIndex``
    built for ``date``. ``multi()`` is called on both indexes before the scan
    and ``execute()`` on both after it.

    :param start: inclusive lower bound compared against ``token(id)``.
    :param end: exclusive upper bound compared against ``token(id)``.
    :param date: value handed to the index constructors; ``defaultdate`` is
        used when it is None.
    :returns: None. Errors are printed with ``traceback.print_exc()`` and are
        not re-raised; a failure on one shop does not stop the others.
    """
    # NOTE(review): a second ``aggregate_shops`` is defined later in this
    # file; that later definition is the one bound after import.
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        si = ShopIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ci.multi()
        shopids = set()
        query = ('select id, title, logo, type, credit_score, num_products, '
                 'good_rating, num_collects from ataobao2.shop '
                 'where token(id)>=:start and token(id)<:end')
        with db.connection() as cur:
            cur.execute(query, dict(start=start, end=end),
                        consistency_level='ONE')
            for row in cur:
                # ``shop_type`` rather than ``type`` so the builtin is not
                # shadowed inside this function.
                (shopid, name, logo, shop_type, credit_score, num_products,
                 good_rating, num_collects) = row
                shopids.add(shopid)
                try:
                    aggregate_shop(si, ci, shopid, name, logo, shop_type,
                                   credit_score, num_products, good_rating,
                                   num_collects)
                except Exception:
                    # Best effort per shop: report and keep going.
                    traceback.print_exc()
        # Only call add() when there is at least one id, so it is never
        # invoked with zero arguments for an empty token range.
        if shopids:
            si.allshopids.add(*shopids)
        si.execute()
        ci.execute()
    except Exception:
        # ``Exception`` (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        traceback.print_exc()
def aggregate_categories(date=None):
    """Write per-category summary info for the 'mon' and 'day' modes.

    For every ``(cate1, cate2)`` pair in ``l1l2s`` the function collects the
    category's ``search_index`` (looked up in ``ataobao2.cate`` unless
    ``cate2`` is ``'all'``), the values of ``si.getshops`` and
    ``ci.getbrands``, and whatever ``ci.getinfo`` returns for the mode. The
    fields ``deals``, ``items``, ``sales`` and ``delta_sales`` default to 0.
    The result is stored with ``ci.setinfo`` and the sales figure with
    ``ci.setindex``. ``ci.multi()`` is called up front and ``ci.execute()``
    once at the end.

    :param date: value handed to the index constructors; ``defaultdate`` is
        used when it is None.
    :returns: None.
    """
    if date is None:
        date = defaultdate
    ci = CategoryIndex(date)
    si = ShopIndex(date)
    ci.multi()
    for cate1, cate2 in list(l1l2s):
        # Values that do not depend on the mode.
        base = {}
        if cate2 != 'all':
            r = getdb().execute(
                'select search_index from ataobao2.cate where id=:id',
                dict(id=cate2), result=True)
            if r and r.results:
                base['search_index'] = r.results[0][0]
            else:
                base['search_index'] = 0
        for mod in ['mon', 'day']:
            # Start from a fresh dict for every mode. Reusing one dict let
            # the 'mon' values survive into the 'day' pass whenever getinfo
            # did not return a field, which defeated the default-to-0 step
            # below; it also handed the same object to both setinfo calls.
            info = dict(base)
            info.update({
                'shops': si.getshops(cate1, cate2),
                'brands': ci.getbrands(cate1, cate2),
            })
            info.update(ci.getinfo(cate1, cate2, mod))
            # Single-argument print(...) parses under Python 2 and 3 and
            # matches the print(...) form used in aggregate_items.
            print('%s %s %s %s' % (cate1, cate2, mod, info))
            for field in ['deals', 'items', 'sales', 'delta_sales']:
                info.setdefault(field, 0)
            ci.setinfo(cate1, cate2, mod, info)
            ci.setindex(cate1, cate2, 'sales', mod, info.get('sales', 0))
    ci.execute()
def aggregate_shops(start, end, date=None):
    """Run ``aggregate_shop`` for each shop in a token range.

    Selects shops from ``ataobao2.shop`` with
    ``start <= token(id) < end``, feeds each row to ``aggregate_shop`` along
    with a ``ShopIndex`` and ``CategoryIndex`` for ``date``, records the ids
    seen in ``si.allshopids`` and finally calls ``execute()`` on both
    indexes (``multi()`` is called on both before the scan).

    :param start: inclusive lower token bound.
    :param end: exclusive upper token bound.
    :param date: passed to the index constructors; falls back to
        ``defaultdate`` when None.
    :returns: None. Exceptions are reported through
        ``traceback.print_exc()`` instead of being raised, and an error for
        a single shop does not abort the scan.
    """
    # NOTE(review): an earlier definition with this name exists in the same
    # file; this later one replaces it at import time.
    shop_query = ('select id, title, logo, type, credit_score, num_products, '
                  'good_rating, num_collects from ataobao2.shop '
                  'where token(id)>=:start and token(id)<:end')
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        si = ShopIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ci.multi()
        shopids = set()
        with db.connection() as cur:
            cur.execute(shop_query, dict(start=start, end=end),
                        consistency_level='ONE')
            for row in cur:
                # The fourth column is named ``kind`` locally to avoid
                # shadowing the ``type`` builtin.
                (shopid, name, logo, kind, credit_score, num_products,
                 good_rating, num_collects) = row
                shopids.add(shopid)
                try:
                    aggregate_shop(si, ci, shopid, name, logo, kind,
                                   credit_score, num_products, good_rating,
                                   num_collects)
                except Exception:
                    # One bad shop must not stop the rest of the range.
                    traceback.print_exc()
        if shopids:
            # Guarded so add() is never called without any member when the
            # token range contains no shops.
            si.allshopids.add(*shopids)
        si.execute()
        ci.execute()
    except Exception:
        # Narrower than a bare except: KeyboardInterrupt / SystemExit are
        # allowed to propagate.
        traceback.print_exc()
def aggregate_brands(date, *brands):
    """Run ``aggregate_brand`` for each given brand.

    A ``CategoryIndex`` and a ``BrandIndex`` are created for ``date``;
    ``multi()`` is called on both before the loop and ``execute()`` on both
    after it.

    :param date: passed to the index constructors and to ``aggregate_brand``.
    :param brands: brands to aggregate, processed in the order given.
    :returns: None. Any exception is printed with ``traceback.print_exc()``
        and not re-raised; the first failure ends the loop and skips both
        ``execute()`` calls.
    """
    ci = CategoryIndex(date)
    bi = BrandIndex(date)
    try:
        ci.multi()
        bi.multi()
        for brand in brands:
            aggregate_brand(bi, ci, date, brand)
        ci.execute()
        bi.execute()
    except Exception:
        # ``Exception`` instead of a bare except so KeyboardInterrupt and
        # SystemExit are not swallowed.
        traceback.print_exc()
def aggregate_items(start, end, hosts=None, date=None, retry=0):
    """Aggregate the items whose partition token lies in a token range.

    Item rows are read from ``ataobao2.item`` and per-date rows from
    ``ataobao2.item_by_date`` (limited to the 60 days ending at ``date`` plus
    16 hours). The per-date rows are grouped by item id and day, and every
    item that passes ``in_blacklist``, has ``num_sold30 > 0`` and has
    per-date rows is handed to ``aggregate_item``. Items with
    ``shopid == 0`` are deleted from both tables instead of aggregated.

    :param start: lower token bound. When ``start > end`` the range is
        treated as wrapping and is split into ``[start, 2**63 - 1]`` and
        ``[-2**63, end]``.
    :param end: upper token bound.
    :param hosts: optional list of hosts; when non-empty the queries run on
        a connection to ``hosts[0]`` and the list is rotated on every retry.
        When empty or None the queries go through ``db.execute``.
    :param date: ``"%Y-%m-%d"`` string; ``defaultdate`` when None.
    :param retry: number of attempts already made.
    :raises Exception: when called with ``retry >= 20``. When that happens
        in a nested retry call it is caught and printed by the calling
        frame's handler.
    :returns: None. All other errors are printed with
        ``traceback.print_exc()``.
    """
    # NOTE(review): a second ``aggregate_items`` is defined later in this
    # file; that later definition is the one bound after import.
    if retry >= 20:
        raise Exception('retry too many times, give up')
    hosts = list(hosts) if hosts else []
    if start > end:
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        # Both halves are done; without this return the inverted range
        # itself would be queried again below.
        return
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        # NOTE(review): +16h here and +8h below look like a UTC+8 day
        # boundary - confirm.
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # Millisecond bounds are needed by both query paths, so they are
        # computed before branching (they used to exist only when ``hosts``
        # was non-empty, which made the other path fail with NameError).
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()
        # NOTE(review): the item query uses >= start / < end while the
        # per-date query uses > start / <= end; left unchanged, confirm which
        # boundary convention the callers' ranges follow.
        item_query = ('select id, shopid, cid, num_sold30, price, brand, '
                      'title, image, num_reviews, credit_score, title, type '
                      'from ataobao2.item '
                      'where token(id)>=:start and token(id)<:end')
        ts_query = ('select id, date, num_collects, num_reviews, num_sold30, '
                    'num_views, price from ataobao2.item_by_date '
                    'where token(id)>:start and token(id)<=:end '
                    'and date>=:date1 and date<:date2 allow filtering')
        try:
            item_args = dict(start=int(start), end=int(end))
            ts_args = dict(start=int(start), end=int(end),
                           date1=d1, date2=d2)
            if hosts:
                conn = db.get_connection(hosts[0])
                try:
                    cur = conn.cursor()
                    cur.execute(item_query, item_args)
                    iteminfos = list(cur)
                    cur.execute(ts_query, ts_args)
                    itemts = list(cur)
                finally:
                    # Close even when a query fails so retries do not leak
                    # connections.
                    conn.close()
            else:
                iteminfos = db.execute(item_query, item_args,
                                       result=True).results
                itemts = db.execute(ts_query, ts_args, result=True).results
        except Exception:
            delay = 30
            # The message reports the real delay (it used to say 5 secs
            # while sleeping 30) and tolerates an empty host list.
            print('cluster error on host {}, range {}, retry {}, '
                  'sleeping {} secs...'.format(
                      hosts[0] if hosts else None, (start, end), retry,
                      delay))
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(delay)
            return aggregate_items(start, end, date=date, hosts=hosts,
                                   retry=retry + 1)

        # item id -> {"YYYY-MM-DD": [num_collects, num_reviews, num_sold30,
        #                            num_views, price]}
        itemtsdict = {}
        for row in itemts:
            itemid, rawdate, values = row[0], row[1], list(row[2:])
            # Malformed data fix: num_collects (index 0) should never exceed
            # 2**24 (~16 million); treat larger values as 0.
            if values[0] > 2**24:
                values[0] = 0
            if not isinstance(rawdate, datetime):
                # Otherwise a packed big-endian 64-bit millisecond timestamp.
                rawdate = datetime.utcfromtimestamp(
                    struct.unpack('!q', rawdate)[0] // 1000)
            day = (rawdate + timedelta(hours=8)).strftime("%Y-%m-%d")
            itemtsdict.setdefault(itemid, {})[day] = values

        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title,
                            item_type, itemid=itemid):
                continue
            brand = clean_brand(brand)
            series = itemtsdict.get(itemid)
            if not (nc > 0 and series):
                continue
            if shopid == 0:
                try:
                    db.execute('delete from ataobao2.item where id=:id',
                               dict(id=itemid))
                    db.execute(
                        'delete from ataobao2.item_by_date where id=:id',
                        dict(id=itemid))
                except Exception:
                    traceback.print_exc()
                # Skip the item even when the delete failed; it used to be
                # aggregated with shopid 0 in that case.
                continue
            try:
                aggregate_item(si, ii, bi, ci, itemid, series, shopid, cid,
                               price, brand, name, image, datestr)
            except Exception:
                traceback.print_exc()
        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        # ``Exception`` (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate.
        traceback.print_exc()
def aggregate_items(start, end, hosts=None, date=None, retry=0):
    """Aggregate all items in a token range into the date's indexes.

    Steps, in order:

    1. A wrapping range (``start > end``) is split into
       ``[start, 2**63 - 1]`` and ``[-2**63, end]`` and each half is
       processed by a recursive call.
    2. Item rows and per-date rows (the 60 days ending at ``date`` plus 16
       hours) are fetched; on failure the host list is rotated, the function
       sleeps and retries with ``retry + 1``.
    3. Per-date rows are grouped by item id and day.
    4. Each item that is not blacklisted, has ``num_sold30 > 0`` and has
       per-date rows is passed to ``aggregate_item``; items with
       ``shopid == 0`` are deleted from both tables instead.
    5. ``execute()`` is called on the shop, brand, category and item indexes.

    :param start: lower token bound.
    :param end: upper token bound.
    :param hosts: optional list of hosts; when non-empty, queries run on a
        connection to ``hosts[0]``. Empty or None means ``db.execute``.
    :param date: ``"%Y-%m-%d"`` string; ``defaultdate`` when None.
    :param retry: number of attempts already made.
    :raises Exception: when called with ``retry >= 20``. Raised from a nested
        retry call, it is caught and printed by the calling frame.
    :returns: None. Other errors are printed with
        ``traceback.print_exc()``.
    """
    # NOTE(review): an earlier definition with this name exists in the same
    # file; this later one replaces it at import time.
    if retry >= 20:
        raise Exception('retry too many times, give up')
    hosts = list(hosts) if hosts else []
    if start > end:
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        # Stop here: the two halves cover the wrapping range, and falling
        # through would query the inverted range once more.
        return
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        # NOTE(review): the 16h offset here and the 8h offset used when
        # grouping look like a UTC+8 day boundary - confirm.
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # Computed up front because both fetch paths need them; previously
        # they were only assigned on the ``hosts`` path.
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()
        try:
            iteminfos, itemts = _fetch_item_rows(
                db, hosts, int(start), int(end), d1, d2)
        except Exception:
            delay = 30
            # Reports the actual sleep time (the text used to say 5 secs)
            # and does not index into an empty host list.
            print('cluster error on host {}, range {}, retry {}, '
                  'sleeping {} secs...'.format(
                      hosts[0] if hosts else None, (start, end), retry,
                      delay))
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(delay)
            return aggregate_items(start, end, date=date, hosts=hosts,
                                   retry=retry + 1)

        itemtsdict = _group_item_timeseries(itemts)

        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title,
                            item_type, itemid=itemid):
                continue
            brand = clean_brand(brand)
            series = itemtsdict.get(itemid)
            if not (nc > 0 and series):
                continue
            if shopid == 0:
                try:
                    db.execute('delete from ataobao2.item where id=:id',
                               dict(id=itemid))
                    db.execute(
                        'delete from ataobao2.item_by_date where id=:id',
                        dict(id=itemid))
                except Exception:
                    traceback.print_exc()
                # Never aggregate a shop-less item, even if the delete
                # failed (it used to fall through to aggregate_item).
                continue
            try:
                aggregate_item(si, ii, bi, ci, itemid, series, shopid, cid,
                               price, brand, name, image, datestr)
            except Exception:
                traceback.print_exc()
        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        # Narrower than a bare except so KeyboardInterrupt / SystemExit
        # propagate.
        traceback.print_exc()


def _fetch_item_rows(db, hosts, start, end, d1, d2):
    """Return ``(iteminfos, itemts)`` for the token range and date window."""
    # NOTE(review): the item query uses >= start / < end while the per-date
    # query uses > start / <= end; left unchanged, confirm which boundary
    # convention the callers' ranges follow.
    item_query = ('select id, shopid, cid, num_sold30, price, brand, '
                  'title, image, num_reviews, credit_score, title, type '
                  'from ataobao2.item '
                  'where token(id)>=:start and token(id)<:end')
    ts_query = ('select id, date, num_collects, num_reviews, num_sold30, '
                'num_views, price from ataobao2.item_by_date '
                'where token(id)>:start and token(id)<=:end '
                'and date>=:date1 and date<:date2 allow filtering')
    item_args = dict(start=start, end=end)
    ts_args = dict(start=start, end=end, date1=d1, date2=d2)
    if not hosts:
        iteminfos = db.execute(item_query, item_args, result=True).results
        itemts = db.execute(ts_query, ts_args, result=True).results
        return iteminfos, itemts
    conn = db.get_connection(hosts[0])
    try:
        cur = conn.cursor()
        cur.execute(item_query, item_args)
        iteminfos = list(cur)
        cur.execute(ts_query, ts_args)
        itemts = list(cur)
    finally:
        # Close on failure too, so repeated retries do not leak connections.
        conn.close()
    return iteminfos, itemts


def _group_item_timeseries(itemts):
    """Group per-date rows into ``{itemid: {"YYYY-MM-DD": values}}``."""
    grouped = {}
    for row in itemts:
        itemid, rawdate, values = row[0], row[1], list(row[2:])
        # Malformed data fix: num_collects (index 0) should never exceed
        # 2**24 (~16 million); treat larger values as 0.
        if values[0] > 2**24:
            values[0] = 0
        if not isinstance(rawdate, datetime):
            # Otherwise a packed big-endian 64-bit millisecond timestamp.
            rawdate = datetime.utcfromtimestamp(
                struct.unpack('!q', rawdate)[0] // 1000)
        day = (rawdate + timedelta(hours=8)).strftime("%Y-%m-%d")
        grouped.setdefault(itemid, {})[day] = values
    return grouped