Exemplo n.º 1
0
def aggregate_categories(date=None):
    if date is None:
        date = defaultdate
    ci = CategoryIndex(date)
    si = ShopIndex(date)
    ci.multi()
    cates = list(l1l2s)
    for cate1, cate2 in cates:
        info = {}
        if cate2 != 'all':
            r = getdb().execute('select search_index from ataobao2.cate where id=:id', 
                                dict(id=cate2), result=True)
            if r and r.results:
                info['search_index'] = r.results[0][0]
            else:
                info['search_index'] = 0
        for mod in ['mon', 'day']:
            info.update({
                'shops': si.getshops(cate1, cate2),
                'brands': ci.getbrands(cate1, cate2),
            })
            info.update(ci.getinfo(cate1, cate2, mod))
            print cate1, cate2, mod, info

            for field in ['deals', 'items', 'sales', 'delta_sales']:
                if field not in info:
                    info[field] = 0
            ci.setinfo(cate1, cate2, mod, info)
            ci.setindex(cate1, cate2, 'sales', mod, info.get('sales', 0))

    ci.execute()
Exemplo n.º 2
0
def aggregate_shops(start, end, date=None):
    """Aggregate every shop whose partition token lies in ``[start, end)``.

    Each row of ``ataobao2.shop`` in the token range is passed to
    ``aggregate_shop``; the ids seen are added to ``si.allshopids`` and the
    shop and category index pipelines are then executed.  Failures are
    printed with ``traceback`` and never propagate to the caller.

    :param start: inclusive lower token bound.
    :param end: exclusive upper token bound.
    :param date: date key for the indexes; ``defaultdate`` when None.
    """
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        si = ShopIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ci.multi()
        shopids = set()
        with db.connection() as cur:
            cur.execute('''select id, title, logo, type, credit_score, num_products, good_rating, num_collects
                    from ataobao2.shop
                    where token(id)>=:start and token(id)<:end''',
                    dict(start=start, end=end), consistency_level='ONE')

            for row in cur:
                # Columns arrive in the order aggregate_shop expects them.
                shopid, name, logo, shop_type, credit_score, num_products, good_rating, num_collects = row
                shopids.add(shopid)
                try:
                    aggregate_shop(si, ci, shopid, name, logo, shop_type,
                                   credit_score, num_products, good_rating,
                                   num_collects)
                except Exception:
                    # One bad shop must not abort the rest of the range.
                    traceback.print_exc()
        if shopids:
            # add() with zero members would be called with no arguments for an
            # empty token range and abort before the pipelines are flushed.
            si.allshopids.add(*shopids)
        si.execute()
        ci.execute()
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # stop the worker.
        traceback.print_exc()
Exemplo n.º 3
0
def aggregate_brand(bi, ci, date, brand):
    """Update index, share and shop figures of ``brand`` per category pair.

    For every ``(cate1, cate2)`` returned by ``bi.getcates`` the brand's sales
    are pushed to the brand index (except for the placeholder brand), its
    share of the category's 'mon' sales is computed and stored together with
    the value of ``bi.getshops``.  For ``cate2 == 'all'`` a row is also
    inserted into ``ataobao2.brand_by_date``.

    :param bi: brand index.
    :param ci: category index.
    :param date: stored as ``datestr`` in ``brand_by_date``.
    :param brand: brand name; the empty string is mapped to '无品牌'.
    """
    db = getdb()
    if brand == '':
        brand = '无品牌'

    for cate1, cate2 in bi.getcates(brand):
        brandinfo = bi.getinfo(brand, cate1, cate2)
        sales = float(brandinfo.get('sales', 0))
        if brand != '无品牌':
            bi.setindex(brand, cate1, cate2, sales)

        categoryinfo = ci.getinfo(cate1, cate2, 'mon')
        try:
            share = sales / float(categoryinfo['sales'])
        except (KeyError, TypeError, ValueError, ZeroDivisionError):
            # Missing, non-numeric or zero category sales: no meaningful share.
            share = 0

        num_shops = bi.getshops(brand, cate1, cate2) or 0
        bi.setinfo(brand, cate1, cate2, {'share': share, 'shops': num_shops})

        if cate2 == 'all':
            db.execute('''insert into ataobao2.brand_by_date (name, datestr, cate1, sales, share, num_shops)
                values (:name, :datestr, :cate1, :sales, :share, :num_shops)''',
                dict(name=brand.decode('utf-8'), datestr=date, cate1=cate1,
                     sales=sales, share=share, num_shops=num_shops))
Exemplo n.º 4
0
def aggregate_shops(start, end, date=None):
    """Aggregate every shop whose partition token lies in ``[start, end)``.

    Each row of ``ataobao2.shop`` in the token range is passed to
    ``aggregate_shop``; the ids seen are added to ``si.allshopids`` and the
    shop and category index pipelines are then executed.  Failures are
    printed with ``traceback`` and never propagate to the caller.

    :param start: inclusive lower token bound.
    :param end: exclusive upper token bound.
    :param date: date key for the indexes; ``defaultdate`` when None.
    """
    try:
        db = getdb()
        if date is None:
            date = defaultdate
        si = ShopIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ci.multi()
        shopids = set()
        with db.connection() as cur:
            cur.execute(
                '''select id, title, logo, type, credit_score, num_products, good_rating, num_collects
                    from ataobao2.shop
                    where token(id)>=:start and token(id)<:end''',
                dict(start=start, end=end),
                consistency_level='ONE')

            for row in cur:
                # Columns arrive in the order aggregate_shop expects them.
                shopid, name, logo, shop_type, credit_score, num_products, good_rating, num_collects = row
                shopids.add(shopid)
                try:
                    aggregate_shop(si, ci, shopid, name, logo, shop_type,
                                   credit_score, num_products, good_rating,
                                   num_collects)
                except Exception:
                    # One bad shop must not abort the rest of the range.
                    traceback.print_exc()
        if shopids:
            # add() with zero members would be called with no arguments for an
            # empty token range and abort before the pipelines are flushed.
            si.allshopids.add(*shopids)
        si.execute()
        ci.execute()
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # stop the worker.
        traceback.print_exc()
Exemplo n.º 5
0
def _getconn(date):
    """Return a ``ShardRedis`` over the Redis URIs assigned to ``date``.

    The assignment is read from ``ataobao2.agghosts``.  When ``date`` has no
    row yet, the latest row flagged ``ready`` inside the look-back window is
    consulted and the first ``AGGRE_URIS`` entry that differs from it is
    picked (the same hosts are reused when none differs, and
    ``AGGRE_URIS[0]`` is used when no ready row exists).  The choice is then
    inserted for ``date``.

    :param date: date string in ``%Y-%m-%d`` form.
    :raises ValueError: when a URI does not look like ``redis://host:port/db``.
    """
    db = getdb()
    r = db.execute('''select datestr, hosts, ready from ataobao2.agghosts
                      where datestr=:date''', dict(date=date), result=True)
    if r.results:
        uris = json.loads(r.results[0][1])
    else:
        now = datetime.utcnow().date()
        dt = datetime.strptime(date, '%Y-%m-%d').date()
        # Look back 14 days beyond the distance between today and `date`.
        lookback = [(now - timedelta(days=days)).strftime('%Y-%m-%d')
                    for days in range(14 + (now - dt).days)]
        ready_rows = []
        if lookback:
            # Skipped when the window is empty (date far in the future), which
            # would otherwise build a query with an empty `in ()` list.
            dates = '(' + ','.join(["'" + d + "'" for d in lookback]) + ')'
            r = db.execute('''select datestr, hosts, ready from ataobao2.agghosts where datestr in {dates}'''.format(dates=dates),
                           result=True)
            ready_rows = [row for row in sorted(r.results) if row[2]]

        if not ready_rows:
            uris = AGGRE_URIS[0]
        else:
            used_uris = json.loads(ready_rows[-1][1])
            for au in AGGRE_URIS:
                if set(au) != set(used_uris):
                    uris = au
                    break
            else:
                uris = used_uris

        db.execute('''insert into ataobao2.agghosts (datestr, hosts)
                      values (:date, :hosts)''', dict(date=date, hosts=json.dumps(uris)))

    # Raw string keeps `\d` a regex escape; compiled once for all URIs.
    uri_re = re.compile(r'redis://(.*):(\d+)/(\d+)')
    conns = []
    for uri in uris:
        match = uri_re.search(uri)
        if match is None:
            raise ValueError('malformed redis uri: {!r}'.format(uri))
        host, port, dbn = match.groups()
        conns.append(redis.Redis(host=host, port=int(port), db=int(dbn)))

    return ShardRedis(conns=conns)
Exemplo n.º 6
0
def es_shop(si, date, shopid):
    """Build the search document of shop ``shopid`` and pass it to ``index_shop``.

    Base attributes come from ``si.getbase``; sales and deals are summed over
    the 'mon' info of each first-level category.  Hot items are looked up in
    ``ataobao2.item`` only when the shop has hot item ids, at least 10000 in
    sales and a credit score of at least 5.

    :param si: shop index.
    :param date: not used by this function.
    :param shopid: shop id; converted with ``int`` for indexing.
    """
    db = getdb()
    base = si.getbase(shopid)
    num_products = int(base.get('num_products', 0))
    credit_score = int(base.get('credit_score', 0)) or 1
    good_rating = base.get('good_rating', '')
    title = base.get('name', '')
    logo = base.get('logo', '')
    shop_type = base.get('type', '')
    worth = float(base.get('worth', 0))

    cates = si.getcates(shopid)
    c1s = list(set([str(pair[0]) for pair in cates]))
    c2s = list(set([str(pair[1]) for pair in cates]))

    sales = 0
    deals = 0
    for c1 in c1s:
        monthly = si.getinfo(c1, 'all', 'mon', shopid)
        if not monthly:
            continue
        sales += float(monthly.get('sales', 0))
        deals += int(monthly.get('deals', 0))

    hot_items = []
    hot_ids = si.gethotitems(shopid) or []
    if hot_ids and sales >= 10000 and credit_score >= 5:
        hot_ids = [int(raw) for raw in hot_ids]
        if len(hot_ids) == 1:
            query = 'select id, image, num_sold30 from ataobao2.item where id=:id'
            params = dict(id=hot_ids[0])
        else:
            # Only the first four ids are queried.
            query = 'select id, image, num_sold30 from ataobao2.item where id in :ids'
            params = dict(ids=tuple(hot_ids[:4]))
        r = db.execute(query, params, result=True)
        for itemid, image, num_sold30 in r.results:
            hot_items.append({
                'itemid': itemid,
                'deals': num_sold30,
                'image': image
            })

    if deals != 0:
        average_price = sales / deals
    else:
        average_price = 0

    index_shop(int(shopid), {
        'title': title,
        'logo': logo,
        'cate1': c1s,
        'cate2': c2s,
        'worth': worth,
        'sales': sales,
        'good_rating': good_rating,
        'type': shop_type,
        'credit_score': credit_score,
        'num_products': num_products,
        'average_price': average_price,
        'hot_items': json.dumps(hot_items),
    })
Exemplo n.º 7
0
 def generate_tasks(self):
     """Queue one ``aggregate_items`` task per slice of the token ring.

     The ring of keyspace ``ataobao2`` is read with ``describe_ring``; each
     token range is cut into slices of ``2**64 // self.step`` tokens plus
     one remainder slice, and every slice becomes a task filed under the
     range's first endpoint.  Tasks are then interleaved across endpoints
     and handed to ``add_tasks``.
     """
     self.clear_redis()
     conn = getdb().get()
     try:
         ring = conn.client.describe_ring('ataobao2')
     finally:
         # Release the connection even when describe_ring fails.
         conn.close()

     v264 = 2**64
     v263_1 = 2**63 - 1
     step = v264 // self.step

     def wrap(token):
         # Fold a token that ran past 2**63 - 1 back into signed 64-bit range.
         return token - v264 if token > v263_1 else token

     tasks = defaultdict(list)
     for tokenrange in ring:
         ostart = int(tokenrange.start_token)
         oend = int(tokenrange.end_token)
         # A range whose start is not below its end wraps around the ring.
         span = oend - ostart if ostart < oend else v264 + oend - ostart
         slicepertoken = span // step
         hosts = tokenrange.endpoints
         bounds = [(ostart + step * i, ostart + step * (i + 1))
                   for i in range(slicepertoken)]
         # Remainder slice.  Its start is wrapped like every other bound:
         # left unwrapped it can exceed 2**63 - 1, and aggregate_items would
         # then keep splitting the range at 2**63 - 1 forever.
         bounds.append((ostart + slicepertoken * step, oend))
         for start, end in bounds:
             tasks[hosts[0]].append([
                 'aggregator.itemagg.aggregate_items',
                 (wrap(start), wrap(end)),
                 dict(date=self.date, hosts=hosts)
             ])

     # Take one task per endpoint in turn so consecutive tasks hit
     # different endpoints.
     universe_tasks = []
     while sum(map(len, tasks.itervalues())):
         for host in tasks:
             if tasks[host]:
                 universe_tasks.append(tasks[host].pop())
     self.add_tasks(*universe_tasks)
     self.finish_generation()
Exemplo n.º 8
0
def save_iteminfo(date, ii, itemid, retry=0):
    db = getdb()
    date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
    date1 = date2 - timedelta(days=60)
    r1 = db.execute(
        '''select title, image, shopid, brand, price, num_sold30, cid
                from ataobao2.item where id=:itemid''',
        dict(itemid=itemid),
        result=True)
    r2 = db.execute(
        '''select date, num_collects, num_reviews, num_sold30, num_views, price
                    from ataobao2.item_by_date
                    where id=:itemid and date>=:date1 and date<:date2''',
        dict(itemid=itemid, date1=date1, date2=date2),
        result=True)

    items = {(r[0] + timedelta(hours=8)).strftime("%Y-%m-%d"): r[1:]
             for r in r2.results}

    if not r1.results:
        from queues import ai1
        ai1.put(itemid)
        return

    if retry < 10 and items == {}:
        print '....retry', retry + 1
        return save_iteminfo(date, ii, itemid, retry + 1)

    if r1.results:
        name, image, shopid, brand, price, deals_mon, cid = r1.results[0]
        info = parse_iteminfo(date, itemid, items, price, cid)
        if info is None:
            print 'no result from parse_iteminfo', date, itemid, items, price, cid
            return

        cate1 = info['l1']
        for cate2 in set([info['l2'], 'all']):
            ii.setinfo(itemid, {
                'name': name,
                'image': image,
                'shopid': shopid,
                'brand': brand,
                'price': info['price'],
                'sales_day': info['sales_day'],
                'sales_mon': info['sales_mon'],
                'deals_day': info['deals_day'],
                'deals_mon': info['deals_mon'],
                'delta_sales_mon': info['delta_sales_mon'],
                'delta_sales_day': info['delta_sales_day'],
            },
                       skey='{}_{}'.format(cate1, cate2))
Exemplo n.º 9
0
def es_shop(si, date, shopid):
    """Build the search document of shop ``shopid`` and pass it to ``index_shop``.

    Base attributes come from ``si.getbase``; sales and deals are summed over
    the 'mon' info of each first-level category.  Hot items are looked up in
    ``ataobao2.item`` only when the shop has hot item ids, at least 10000 in
    sales and a credit score of at least 5.

    :param si: shop index.
    :param date: not used by this function.
    :param shopid: shop id; converted with ``int`` for indexing.
    """
    db = getdb()
    base = si.getbase(shopid)
    num_products = int(base.get('num_products', 0))
    credit_score = int(base.get('credit_score', 0)) or 1
    good_rating = base.get('good_rating', '')
    title = base.get('name', '')
    logo = base.get('logo', '')
    shop_type = base.get('type', '')
    worth = float(base.get('worth', 0))

    cates = si.getcates(shopid)
    c1s = list(set([str(pair[0]) for pair in cates]))
    c2s = list(set([str(pair[1]) for pair in cates]))

    sales = 0
    deals = 0
    for c1 in c1s:
        monthly = si.getinfo(c1, 'all', 'mon', shopid)
        if not monthly:
            continue
        sales += float(monthly.get('sales', 0))
        deals += int(monthly.get('deals', 0))

    hot_items = []
    hot_ids = si.gethotitems(shopid) or []
    if hot_ids and sales >= 10000 and credit_score >= 5:
        hot_ids = [int(raw) for raw in hot_ids]
        if len(hot_ids) == 1:
            query = 'select id, image, num_sold30 from ataobao2.item where id=:id'
            params = dict(id=hot_ids[0])
        else:
            # Only the first four ids are queried.
            query = 'select id, image, num_sold30 from ataobao2.item where id in :ids'
            params = dict(ids=tuple(hot_ids[:4]))
        r = db.execute(query, params, result=True)
        for itemid, image, num_sold30 in r.results:
            hot_items.append({'itemid': itemid, 'deals': num_sold30, 'image': image})

    if deals != 0:
        average_price = sales / deals
    else:
        average_price = 0

    index_shop(int(shopid), {
        'title': title,
        'logo': logo,
        'cate1': c1s,
        'cate2': c2s,
        'worth': worth,
        'sales': sales,
        'good_rating': good_rating,
        'type': shop_type,
        'credit_score': credit_score,
        'num_products': num_products,
        'average_price': average_price,
        'hot_items': json.dumps(hot_items),
    })
Exemplo n.º 10
0
 def generate_tasks(self):
     """Queue one ``aggregate_items`` task per slice of the token ring.

     The ring of keyspace ``ataobao2`` is read with ``describe_ring``; each
     token range is cut into slices of ``2**64 // self.step`` tokens plus
     one remainder slice, and every slice becomes a task filed under the
     range's first endpoint.  Tasks are then interleaved across endpoints
     and handed to ``add_tasks``.
     """
     self.clear_redis()
     conn = getdb().get()
     try:
         ring = conn.client.describe_ring('ataobao2')
     finally:
         # Release the connection even when describe_ring fails.
         conn.close()

     v264 = 2**64
     v263_1 = 2**63-1
     step = v264 // self.step

     def wrap(token):
         # Fold a token that ran past 2**63 - 1 back into signed 64-bit range.
         return token - v264 if token > v263_1 else token

     tasks = defaultdict(list)
     for tokenrange in ring:
         ostart = int(tokenrange.start_token)
         oend = int(tokenrange.end_token)
         # A range whose start is not below its end wraps around the ring.
         span = oend - ostart if ostart < oend else v264 + oend - ostart
         slicepertoken = span // step
         hosts = tokenrange.endpoints
         bounds = [(ostart + step * i, ostart + step * (i + 1))
                   for i in range(slicepertoken)]
         # Remainder slice.  Its start is wrapped like every other bound:
         # left unwrapped it can exceed 2**63 - 1, and aggregate_items would
         # then keep splitting the range at 2**63 - 1 forever.
         bounds.append((ostart + slicepertoken*step, oend))
         for start, end in bounds:
             tasks[hosts[0]].append(['aggregator.itemagg.aggregate_items', (wrap(start), wrap(end)), dict(date=self.date, hosts=hosts)])

     # Take one task per endpoint in turn so consecutive tasks hit
     # different endpoints.
     universe_tasks = []
     while sum(map(len, tasks.itervalues())):
         for host in tasks:
             if tasks[host]:
                 universe_tasks.append(tasks[host].pop())
     self.add_tasks(*universe_tasks)
     self.finish_generation()
Exemplo n.º 11
0
def es_brand(bi, date, brand):
    """Build the search document of ``brand`` and pass it to ``index_brand``.

    Shop, item, deal, sales and delta figures are summed over the brand's
    first-level categories (``cate2 == 'all'``).  Brands with fewer than 100
    items are not indexed.

    :param bi: brand index.
    :param date: not used by this function.
    :param brand: brand name; the empty string is mapped to '无品牌'.
    """
    db = getdb()
    if brand == '':
        brand = '无品牌'

    cates = bi.getcates(brand)
    c1s = list(set([c[0] for c in cates]))
    c2s = list(set([c[1] for c in cates]))
    shops = items = deals = sales = delta = 0
    for c1 in c1s:
        brandinfo = bi.getinfo(brand, c1, 'all')
        shops += int(brandinfo.get('shops', 0))
        items += int(brandinfo.get('items', 0))
        deals += int(brandinfo.get('deals', 0))
        sales += float(brandinfo.get('sales', 0))
        delta += float(brandinfo.get('delta_sales', 0))

    # we don't care about brands that do not have more than 100 items
    if items < 100:
        return

    r = db.execute('select logo from ataobao2.brand where name=:name',
                   dict(name=brand),
                   result=True)
    try:
        logo = r.results[0][0]
    except (AttributeError, IndexError, TypeError):
        # No result object, no rows, or results is None: index without a logo.
        logo = ''

    index_brand(brand, {
        'title': brand,
        'cate1': [str(c) for c in c1s],
        'cate2': [str(c) for c in c2s],
        'logo': logo,
        'shops': shops,
        'items': items,
        'deals': deals,
        'sales': sales,
        'delta': delta,
    })
Exemplo n.º 12
0
def save_iteminfo(date, ii, itemid, retry=0):
    db = getdb()
    date2 = datetime.strptime(date, "%Y-%m-%d")+timedelta(hours=16)
    date1 = date2 - timedelta(days=60)
    r1 = db.execute('''select title, image, shopid, brand, price, num_sold30, cid
                from ataobao2.item where id=:itemid''',
                dict(itemid=itemid), result=True)
    r2 = db.execute('''select date, num_collects, num_reviews, num_sold30, num_views, price
                    from ataobao2.item_by_date
                    where id=:itemid and date>=:date1 and date<:date2''',
                    dict(itemid=itemid, date1=date1, date2=date2), result=True)

    items = {(r[0]+timedelta(hours=8)).strftime("%Y-%m-%d"): r[1:]
                for r in r2.results}
        
    if not r1.results:
        from queues import ai1
        ai1.put(itemid)
        return

    if retry < 10 and items == {}:
        print '....retry', retry+1
        return save_iteminfo(date, ii, itemid, retry+1)

    if r1.results:
        name, image, shopid, brand, price, deals_mon, cid = r1.results[0]
        info = parse_iteminfo(date, itemid, items, price, cid)
        if info is None:
            print 'no result from parse_iteminfo', date, itemid, items, price, cid
            return

        cate1 = info['l1']
        for cate2 in set([info['l2'], 'all']):
            ii.setinfo(itemid, {
                'name': name,
                'image': image,
                'shopid': shopid,
                'brand': brand,
                'price': info['price'],
                'sales_day': info['sales_day'],
                'sales_mon': info['sales_mon'],
                'deals_day': info['deals_day'],
                'deals_mon': info['deals_mon'],
                'delta_sales_mon': info['delta_sales_mon'],
                'delta_sales_day': info['delta_sales_day'],
            }, skey='{}_{}'.format(cate1, cate2))
Exemplo n.º 13
0
def _getconn(date):
    """Return a ``ShardRedis`` over the Redis URIs assigned to ``date``.

    The assignment is read from ``ataobao2.agghosts``.  When ``date`` has no
    row yet, the latest row flagged ``ready`` inside the look-back window is
    consulted and the first ``AGGRE_URIS`` entry that differs from it is
    picked (the same hosts are reused when none differs, and
    ``AGGRE_URIS[0]`` is used when no ready row exists).  The choice is then
    inserted for ``date``.

    :param date: date string in ``%Y-%m-%d`` form.
    :raises ValueError: when a URI does not look like ``redis://host:port/db``.
    """
    db = getdb()
    r = db.execute('''select datestr, hosts, ready from ataobao2.agghosts
                      where datestr=:date''',
                   dict(date=date),
                   result=True)
    if r.results:
        uris = json.loads(r.results[0][1])
    else:
        now = datetime.utcnow().date()
        dt = datetime.strptime(date, '%Y-%m-%d').date()
        # Look back 14 days beyond the distance between today and `date`.
        lookback = [(now - timedelta(days=days)).strftime('%Y-%m-%d')
                    for days in range(14 + (now - dt).days)]
        ready_rows = []
        if lookback:
            # Skipped when the window is empty (date far in the future), which
            # would otherwise build a query with an empty `in ()` list.
            dates = '(' + ','.join(["'" + d + "'" for d in lookback]) + ')'
            r = db.execute(
                '''select datestr, hosts, ready from ataobao2.agghosts where datestr in {dates}'''
                .format(dates=dates),
                result=True)
            ready_rows = [row for row in sorted(r.results) if row[2]]

        if not ready_rows:
            uris = AGGRE_URIS[0]
        else:
            used_uris = json.loads(ready_rows[-1][1])
            for au in AGGRE_URIS:
                if set(au) != set(used_uris):
                    uris = au
                    break
            else:
                uris = used_uris

        db.execute(
            '''insert into ataobao2.agghosts (datestr, hosts)
                      values (:date, :hosts)''',
            dict(date=date, hosts=json.dumps(uris)))

    # Raw string keeps `\d` a regex escape; compiled once for all URIs.
    uri_re = re.compile(r'redis://(.*):(\d+)/(\d+)')
    conns = []
    for uri in uris:
        match = uri_re.search(uri)
        if match is None:
            raise ValueError('malformed redis uri: {!r}'.format(uri))
        host, port, dbn = match.groups()
        conns.append(redis.Redis(host=host, port=int(port), db=int(dbn)))

    return ShardRedis(conns=conns)
Exemplo n.º 14
0
def es_brand(bi, date, brand):
    """Build the search document of ``brand`` and pass it to ``index_brand``.

    Shop, item, deal, sales and delta figures are summed over the brand's
    first-level categories (``cate2 == "all"``).  Brands with fewer than 100
    items are not indexed.

    :param bi: brand index.
    :param date: not used by this function.
    :param brand: brand name; the empty string is mapped to "无品牌".
    """
    db = getdb()
    if brand == "":
        brand = "无品牌"

    cates = bi.getcates(brand)
    c1s = list(set([c[0] for c in cates]))
    c2s = list(set([c[1] for c in cates]))
    shops = items = deals = sales = delta = 0
    for c1 in c1s:
        brandinfo = bi.getinfo(brand, c1, "all")
        shops += int(brandinfo.get("shops", 0))
        items += int(brandinfo.get("items", 0))
        deals += int(brandinfo.get("deals", 0))
        sales += float(brandinfo.get("sales", 0))
        delta += float(brandinfo.get("delta_sales", 0))

    # we don't care about brands that do not have more than 100 items
    if items < 100:
        return

    r = db.execute("select logo from ataobao2.brand where name=:name", dict(name=brand), result=True)
    try:
        logo = r.results[0][0]
    except (AttributeError, IndexError, TypeError):
        # No result object, no rows, or results is None: index without a logo.
        logo = ""

    index_brand(brand, {
        "title": brand,
        "cate1": [str(c) for c in c1s],
        "cate2": [str(c) for c in c2s],
        "logo": logo,
        "shops": shops,
        "items": items,
        "deals": deals,
        "sales": sales,
        "delta": delta,
    })
Exemplo n.º 15
0
def clearall(date):
    """Reset the aggregation state for ``date``.

    Clears and resets every process in ``all_processes``, deletes the
    ``agghosts`` row of ``date``, calls ``clear_date``, prunes ``agghosts``
    rows dated before the latest row flagged ready (or before ``date`` when
    none is ready) and deletes the 'shopblacknew' blacklist entries.

    :param date: date string identifying the run to clear.
    """
    db = getdb('db1')
    for p in all_processes:
        p.clear_redis()
        p.reset()

    db.execute('delete from ataobao2.agghosts where datestr=:date', dict(date=date))
    clear_date(date)

    r = db.execute('select datestr, ready from ataobao2.agghosts', result=True)
    ahs = sorted(r.results)
    # Explicit emptiness check instead of a bare except around [-1], so real
    # errors are no longer silently turned into `last = date`.
    ready_dates = [d for d, ready in ahs if ready]
    last = ready_dates[-1] if ready_dates else date
    for d, _ready in ahs:
        if d < last:
            db.execute('delete from ataobao2.agghosts where datestr=:date', dict(date=d))

    db.execute('delete from ataobao2.blacklist where type=\'shopblacknew\';')
Exemplo n.º 16
0
def clearall(date):
    """Reset the aggregation state for ``date``.

    Clears and resets every process in ``all_processes``, deletes the
    ``agghosts`` row of ``date``, calls ``clear_date``, prunes ``agghosts``
    rows dated before the latest row flagged ready (or before ``date`` when
    none is ready) and deletes the 'shopblacknew' blacklist entries.

    :param date: date string identifying the run to clear.
    """
    db = getdb('db1')
    for p in all_processes:
        p.clear_redis()
        p.reset()

    db.execute('delete from ataobao2.agghosts where datestr=:date',
               dict(date=date))
    clear_date(date)

    r = db.execute('select datestr, ready from ataobao2.agghosts', result=True)
    ahs = sorted(r.results)
    # Explicit emptiness check instead of a bare except around [-1], so real
    # errors are no longer silently turned into `last = date`.
    ready_dates = [d for d, ready in ahs if ready]
    last = ready_dates[-1] if ready_dates else date
    for d, _ready in ahs:
        if d < last:
            db.execute('delete from ataobao2.agghosts where datestr=:date',
                       dict(date=d))

    db.execute('delete from ataobao2.blacklist where type=\'shopblacknew\';')
Exemplo n.º 17
0
def aggregate_brand(bi, ci, date, brand):
    """Update index, share and shop figures of ``brand`` per category pair.

    For every ``(cate1, cate2)`` returned by ``bi.getcates`` the brand's sales
    are pushed to the brand index (except for the placeholder brand), its
    share of the category's 'mon' sales is computed and stored together with
    the value of ``bi.getshops``.  For ``cate2 == 'all'`` a row is also
    inserted into ``ataobao2.brand_by_date``.

    :param bi: brand index.
    :param ci: category index.
    :param date: stored as ``datestr`` in ``brand_by_date``.
    :param brand: brand name; the empty string is mapped to '无品牌'.
    """
    db = getdb()
    if brand == '':
        brand = '无品牌'

    for cate1, cate2 in bi.getcates(brand):
        brandinfo = bi.getinfo(brand, cate1, cate2)
        sales = float(brandinfo.get('sales', 0))
        if brand != '无品牌':
            bi.setindex(brand, cate1, cate2, sales)

        categoryinfo = ci.getinfo(cate1, cate2, 'mon')
        try:
            share = sales / float(categoryinfo['sales'])
        except (KeyError, TypeError, ValueError, ZeroDivisionError):
            # Missing, non-numeric or zero category sales: no meaningful share.
            share = 0

        num_shops = bi.getshops(brand, cate1, cate2) or 0
        bi.setinfo(brand, cate1, cate2, {'share': share, 'shops': num_shops})

        if cate2 == 'all':
            db.execute(
                '''insert into ataobao2.brand_by_date (name, datestr, cate1, sales, share, num_shops)
                values (:name, :datestr, :cate1, :sales, :share, :num_shops)''',
                dict(name=brand.decode('utf-8'),
                     datestr=date,
                     cate1=cate1,
                     sales=sales,
                     share=share,
                     num_shops=num_shops))
Exemplo n.º 18
0
def _top_shares(entries, total_sales, label=None):
    """Rank ``entries`` by sales and return the top nine plus a remainder row.

    :param entries: list of ``(name, sales, deals)`` tuples.
    :param total_sales: denominator for the percentages; 0 yields ``[]``.
    :param label: optional callable applied to the name of each kept entry.
    :returns: list of ``(name, 'NN.NN%', sales, deals)`` tuples; everything
        past the ninth entry is summed into one u'其他' row when it has sales.
    """
    if total_sales == 0:
        return []
    ranked = sorted(entries, key=itemgetter(1), reverse=True)
    head, tail = ranked[:9], ranked[9:]
    rows = [(label(name) if label else name,
             '{:4.2f}%'.format(sales * 100 / total_sales), sales, deals)
            for name, sales, deals in head]
    other_sales = sum(entry[1] for entry in tail)
    if other_sales > 0:
        other_deals = sum(entry[2] for entry in tail)
        rows.append((u'其他',
                     '{:4.2f}%'.format(other_sales * 100 / total_sales),
                     other_sales, other_deals))
    return rows


def save_history_shop(si, date, shopid):
    """Insert the daily history row of shop ``shopid`` into ``shop_by_date``.

    The row holds the shop's worth and collect count, the 'day' rank, sales
    and deals per first-level category (``catetrend``) and the brand and
    category sales shares for 'mon' and 'day'.  Nothing is written when
    ``si.getbase`` returns nothing for the shop.

    :param si: shop index.
    :param date: stored as ``datestr``.
    :param shopid: id of the shop.
    """
    db = getdb()
    shopinfo = si.getbase(shopid)
    if not shopinfo:
        return

    num_collects = int(shopinfo.get('num_collects', 0))
    worth = float(shopinfo.get('worth', 0))
    cates = si.getcates(shopid)

    catetrend = {}
    for c1 in list(set([c[0] for c in cates])):
        trend = {'rank': si.getrank(c1, 'all', shopid, 'day')}
        info = si.getinfo(c1, 'all', 'day', shopid)
        if info:
            trend['sales'] = float(info.get('sales', 0))
            trend['deals'] = int(info.get('deals', 0))
        catetrend[c1] = trend

    brandshare = {}
    for mod in ['mon', 'day']:
        salesinfo = si.getbrandinfo(shopid, 'sales', mod)
        dealsinfo = si.getbrandinfo(shopid, 'deals', mod)
        binfo = [(brand, float(value), int(dealsinfo.get(brand, 0)))
                 for brand, value in salesinfo.iteritems() if float(value) > 0]
        brand_total = sum(entry[1] for entry in binfo)
        brandshare[mod] = _top_shares(binfo, brand_total,
                                      label=lambda name: name.decode('utf-8'))

    cateshare = {}
    total_sales = 0
    for mod in ['mon', 'day']:
        cinfo = []
        total_sales = 0
        for cate1, cate2 in cates:
            info = si.getinfo(cate1, cate2, mod, shopid)
            if info and float(info.get('sales', 0)) > 0:
                cate_sales = float(info.get('sales', 0))
                total_sales += cate_sales
                cinfo.append((cate2, cate_sales, int(info.get('deals', 0))))
        # The remainder row now carries the same trailing '%' as every other
        # share; it was previously formatted without it.
        cateshare[mod] = _top_shares(cinfo, total_sales)

    # NOTE(review): the stored `sales` is the 'day' total left over from the
    # category loop above, as before. A per-cate1 'all' sum used to be
    # accumulated but was never used -- confirm which figure is intended.
    db.execute(
        '''insert into ataobao2.shop_by_date 
                    (id, datestr, worth, sales, num_collects, catetrend, brandshare, cateshare) values
                    (:shopid, :datestr, :worth, :sales, :num_collects, :catetrend, :brandshare, :cateshare)''',
        dict(worth=worth,
             num_collects=num_collects,
             shopid=shopid,
             datestr=date,
             sales=total_sales,
             catetrend=json.dumps(catetrend),
             brandshare=json.dumps(brandshare),
             cateshare=json.dumps(cateshare)))
Exemplo n.º 19
0
def _fetch_item_rows(db, hosts, start, end, d1, d2):
    """Run the two token-range queries and return ``(iteminfos, itemts)``.

    With a non-empty ``hosts`` the queries go through a dedicated connection
    to ``hosts[0]``; otherwise they go through ``db.execute``.
    """
    item_cql = '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end'''
    ts_cql = '''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date 
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering'''
    item_args = dict(start=int(start), end=int(end))
    ts_args = dict(start=int(start), end=int(end), date1=d1, date2=d2)

    if hosts:
        conn = db.get_connection(hosts[0])
        try:
            cur = conn.cursor()
            cur.execute(item_cql, item_args)
            iteminfos = list(cur)
            cur.execute(ts_cql, ts_args)
            itemts = list(cur)
        finally:
            # Close the connection on failure too; the caller retries.
            conn.close()
        return iteminfos, itemts

    iteminfos = db.execute(item_cql, item_args, result=True).results
    itemts = db.execute(ts_cql, ts_args, result=True).results
    return iteminfos, itemts


def _group_timeseries(itemts):
    """Index item_by_date rows as ``{itemid: {'YYYY-MM-DD': values}}``."""
    grouped = {}
    for row in itemts:
        itemid, stamp, values = row[0], row[1], list(row[2:])
        # fix data malform
        # 1. num_collects, index at 0, should not be larger than 2**24 ~ 16 million
        if values[0] > 2**24:
            values[0] = 0
        if not isinstance(stamp, datetime):
            # Raw value: big-endian signed 64-bit timestamp in milliseconds.
            stamp = datetime.utcfromtimestamp(
                struct.unpack('!q', stamp)[0] // 1000)
        day = (stamp + timedelta(hours=8)).strftime("%Y-%m-%d")
        grouped.setdefault(itemid, {})[day] = values
    return grouped


def aggregate_items(start, end, hosts=None, date=None, retry=0):
    """Aggregate every item whose partition token lies between ``start`` and ``end``.

    Item rows and their ``item_by_date`` rows of the 60 days ending at
    ``date`` + 16h are loaded, blacklisted items are skipped, items with
    ``shopid == 0`` are deleted, and the rest is passed to ``aggregate_item``.
    A failed load rotates ``hosts``, sleeps and retries.  Errors are printed
    with ``traceback`` rather than propagated.

    :param start: lower token bound; a value above ``end`` denotes a range
        that wraps around the signed 64-bit token space.
    :param end: upper token bound.
    :param hosts: endpoints to query, preferred one first; None or empty
        means "use the default ``db.execute`` path".
    :param date: date string in ``%Y-%m-%d`` form; ``defaultdate`` when None.
    :param retry: number of retries already performed.
    :raises Exception: when called with ``retry >= 20``.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')

    hosts = list(hosts) if hosts else []

    if start > end:
        # Wrap-around range: process the two halves and stop. Falling
        # through would query the inverted range a third time.
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        return

    retry_delay = 30

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # Millisecond timestamps, computed up front so both query paths have
        # them (they used to be defined on the hosts path only).
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        try:
            iteminfos, itemts = _fetch_item_rows(db, hosts, start, end, d1, d2)
        except Exception:
            print(
                'cluster error on host {}, range {}, retry {}, sleeping {} secs...'
                .format(hosts[0] if hosts else None, (start, end), retry,
                        retry_delay))
            # Try the next endpoint first on the following attempt.
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(retry_delay)
            return aggregate_items(start,
                                   end,
                                   date=date,
                                   hosts=hosts,
                                   retry=retry + 1)

        itemtsdict = _group_timeseries(itemts)

        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid,
                            price,
                            cid,
                            nc,
                            nr,
                            credit_score,
                            title,
                            item_type,
                            itemid=itemid):
                continue
            brand = clean_brand(brand)
            if not (nc > 0 and itemtsdict.get(itemid)):
                continue
            if shopid == 0:
                try:
                    db.execute('delete from ataobao2.item where id=:id',
                               dict(id=itemid))
                    db.execute(
                        'delete from ataobao2.item_by_date where id=:id',
                        dict(id=itemid))
                    continue
                except Exception:
                    # As before, a failed delete falls through to aggregation.
                    traceback.print_exc()
            try:
                aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid],
                               shopid, cid, price, brand, name, image,
                               datestr)
            except Exception:
                traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()
Exemplo n.º 20
0
def _fetch_item_rows(db, hosts, start, end, d1, d2):
    """Run the two token-range queries and return ``(iteminfos, itemts)``.

    When ``hosts`` is non-empty the queries go through a dedicated connection
    to ``hosts[0]``, which is always closed afterwards; otherwise they go
    through ``db.execute``.
    """
    # NOTE(review): the item query covers [start, end) while the item_by_date
    # query covers (start, end]; the boundaries are kept exactly as they were,
    # but they look inconsistent -- confirm which one is intended.
    item_cql = '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end'''
    ts_cql = '''select id, date, num_collects, num_reviews, num_sold30, num_views, price
                    from ataobao2.item_by_date
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering'''
    item_args = dict(start=int(start), end=int(end))
    ts_args = dict(start=int(start), end=int(end), date1=d1, date2=d2)

    if not hosts:
        iteminfos = db.execute(item_cql, item_args, result=True).results
        itemts = db.execute(ts_cql, ts_args, result=True).results
        return iteminfos, itemts

    conn = db.get_connection(hosts[0])
    try:
        cur = conn.cursor()
        cur.execute(item_cql, item_args)
        iteminfos = list(cur)
        cur.execute(ts_cql, ts_args)
        itemts = list(cur)
    finally:
        # Close the connection even when a query fails, so retries do not leak.
        conn.close()
    return iteminfos, itemts


def _group_timeseries(itemts):
    """Group item_by_date rows into ``{itemid: {"YYYY-MM-DD": values}}``."""
    series = {}
    for row in itemts:
        itemid, stamp, values = row[0], row[1], list(row[2:])
        # fix data malform
        # 1. num_collects, index at 0, should not be larger than 2**24 ~ 16 million
        if values[0] > 2**24:
            values[0] = 0
        if not isinstance(stamp, datetime):
            # Otherwise the date is a packed big-endian 64-bit millisecond timestamp.
            stamp = datetime.utcfromtimestamp(struct.unpack('!q', stamp)[0]/1000)
        # Dates are bucketed after shifting the UTC timestamp by +8 hours.
        day = (stamp+timedelta(hours=8)).strftime("%Y-%m-%d")
        series.setdefault(itemid, {})[day] = values
    return series


def aggregate_items(start, end, hosts=None, date=None, retry=0):
    """Aggregate every item whose partition token lies between start and end.

    Item rows and their last 60 days of item_by_date rows are loaded,
    blacklisted items are skipped, and the rest are fed to ``aggregate_item``
    before the shop/brand/category/item indexes are flushed.

    :param start: lower token bound; ``start > end`` denotes a range that
        wraps around the token ring and is processed as two halves.
    :param end: upper token bound.
    :param hosts: optional list of hosts; the first one is queried directly
        and the list is rotated on every retry.
    :param date: aggregation date as "%Y-%m-%d"; defaults to ``defaultdate``.
    :param retry: number of retries already made.
    :raises Exception: immediately when ``retry`` has reached 20.  All other
        errors are printed with ``traceback.print_exc`` and swallowed.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')

    hosts = list(hosts) if hosts else []

    if start > end:
        aggregate_items(start, 2**63-1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        # Both halves are done; do not also process the inverted range.
        return

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        date2 = datetime.strptime(date, "%Y-%m-%d")+timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # Millisecond bounds are needed by the item_by_date query on both the
        # per-host and the default path.
        d2 = calendar.timegm(date2.utctimetuple())*1000
        d1 = calendar.timegm(date1.utctimetuple())*1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        try:
            iteminfos, itemts = _fetch_item_rows(db, hosts, start, end, d1, d2)
        except Exception:
            print('cluster error on host {}, range {}, retry {}, sleeping 30 secs...'.format(
                hosts[0] if hosts else None, (start, end), retry))
            # Rotate so the next attempt talks to a different host.
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(30)
            return aggregate_items(start, end, date=date, hosts=hosts, retry=retry+1)

        series = _group_timeseries(itemts)

        for (itemid, shopid, cid, nc, price, brand, name, image, nr,
             credit_score, title, item_type) in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title, item_type, itemid=itemid):
                continue
            brand = clean_brand(brand)
            history = series.get(itemid)
            if not (nc > 0 and history):
                continue
            if shopid == 0:
                # Items without a shop are purged and never aggregated, even
                # when the purge itself fails.
                try:
                    db.execute('delete from ataobao2.item where id=:id', dict(id=itemid))
                    db.execute('delete from ataobao2.item_by_date where id=:id', dict(id=itemid))
                except Exception:
                    traceback.print_exc()
                continue
            try:
                aggregate_item(si, ii, bi, ci, itemid, history, shopid, cid, price, brand, name, image, datestr)
            except Exception:
                # One bad item must not abort the whole range.
                traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()
Exemplo n.º 21
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Blacklist Item from our Aggregation Process """
from aggregator.models import getdb
import threading

# Held by load_bls() while it checks and fills the blacklist caches below.
lock = threading.Lock()

# Module-level database handle; load_bls() reads ataobao2.blacklist through it.
db = getdb()

# Blacklist caches. All three stay None until load_bls() populates them.
# Rows of type 'shopblack': their args converted to int.
bl_shopblackids = None
# Rows of type 'shopwhite': their args converted to int.
bl_shopwhiteids = None
# Rows of type 'cateprice': maps args to the row value converted to float.
bl_thresholds = None

def load_bls():
    """Fill the module-level blacklist caches from ataobao2.blacklist.

    The table is read only while ``bl_shopblackids`` is still None; the check
    and the load both happen under ``lock``.
    """
    global bl_shopblackids, bl_shopwhiteids, bl_thresholds
    with lock:
        if bl_shopblackids is not None:
            # Already loaded.
            return
        print('loading blacklist')
        fetched = db.execute('select type, args, value from ataobao2.blacklist', result=True)
        rules = set(fetched.results)
        bl_shopblackids = set(int(args) for kind, args, _ in rules if kind == 'shopblack')
        bl_thresholds = dict((args, float(value)) for kind, args, value in rules if kind == 'cateprice')
        bl_shopwhiteids = set(int(args) for kind, args, _ in rules if kind == 'shopwhite')
        print('blacklist loaded')

def get_l1_and_l2(cid):
    # NOTE(review): only the opening statements of this function are present;
    # judging by the name it presumably resolves the level-1 and level-2
    # categories of ``cid`` -- TODO confirm against the full implementation.
    # The category table is imported inside the function, not at module level.
    from crawler.cates import cates
    if cid in cates:
        cidchain = []
Exemplo n.º 22
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from aggregator.models import getdb
from aggregator.indexes import ShopIndex, ItemIndex, BrandIndex, CategoryIndex
from aggregator.processes import Process

from datetime import datetime, timedelta
from collections import Counter

from crawler.cates import l1l2s, topcids

import json
import traceback

# Default aggregation date: the "%Y-%m-%d" date of the instant 16 hours before
# the current UTC time, evaluated once at import.
defaultdate = (datetime.utcnow()+timedelta(hours=-16)).strftime("%Y-%m-%d")
# Module-level database handle obtained from aggregator.models.getdb().
db = getdb()

def top10_brands(date=None):
    """Rank brands by their sales summed over every category id in ``topcids``.

    For each category the brand index for ``(cate1, 'all')`` is read; every
    brand's sales are added to one counter and its deal count to another, and
    the ten brands with the highest summed sales are selected.

    :param date: date passed to ``BrandIndex``; falls back to ``defaultdate``.
    """
    if date is None:
        date = defaultdate
    bi = BrandIndex(date)
    # Summed sales per brand, and summed deal counts per brand.
    top10 = Counter()
    branddeals = Counter()
    for cate1 in topcids:
        for brand, sales in bi.getindex(cate1, 'all'):
            # Skip the placeholder brand (the literal translates to "no brand").
            if brand == '无品牌':
                continue
            info = bi.getinfo(brand, cate1, 'all')
            branddeals[brand] += int(info.get('deals', 0))
            top10[brand] += float(sales)
    # From here on top10 is a list of (brand, total_sales) pairs, best first.
    top10 = top10.most_common(10)
Exemplo n.º 23
0
def _top_shares(entries, total_sales, decode_labels=False):
    """Turn ``(label, sales, deals)`` entries into at most ten share rows.

    The nine best-selling entries are kept as
    ``(label, "xx.xx%", sales, deals)``; everything after them is folded into
    a single "其他" row when its combined sales are positive.  An empty list
    is returned when ``total_sales`` is 0.

    :param decode_labels: decode each kept label from UTF-8 bytes first.
    """
    if total_sales == 0:
        return []
    ranked = sorted(entries, key=itemgetter(1), reverse=True)
    head, tail = ranked[:9], ranked[9:]
    rows = []
    for label, sales, deals in head:
        if decode_labels:
            label = label.decode('utf-8')
        rows.append((label, '{:4.2f}%'.format(sales*100/total_sales), sales, deals))
    other_sales = sum(sales for _, sales, _ in tail)
    if other_sales > 0:
        other_deals = sum(deals for _, _, deals in tail)
        rows.append((u'其他', '{:4.2f}%'.format(other_sales*100/total_sales), other_sales, other_deals))
    return rows


def save_history_shop(si, date, shopid):
    """Persist one shop's daily snapshot into ataobao2.shop_by_date.

    The snapshot holds the shop's worth and collect count, a per-category
    trend (rank, sales, deals for the 'day' window) and the brand and
    category sales shares for the 'mon' and 'day' windows.  Nothing is
    written, and the database is not touched, when the index has no base
    info for the shop.

    :param si: shop index providing getbase/getcates/getrank/getinfo/getbrandinfo.
    :param date: date string stored in the ``datestr`` column.
    :param shopid: id of the shop to snapshot.
    """
    shopinfo = si.getbase(shopid)
    if not shopinfo:
        return

    num_collects = int(shopinfo.get('num_collects', 0))
    worth = float(shopinfo.get('worth', 0))
    cates = si.getcates(shopid)

    catetrend = {}
    for c1 in set(c[0] for c in cates):
        trend = {'rank': si.getrank(c1, 'all', shopid, 'day')}
        info = si.getinfo(c1, 'all', 'day', shopid)
        if info:
            trend['sales'] = float(info.get('sales', 0))
            trend['deals'] = int(info.get('deals', 0))
        catetrend[c1] = trend

    brandshare = {}
    for mod in ['mon', 'day']:
        salesinfo = si.getbrandinfo(shopid, 'sales', mod)
        dealsinfo = si.getbrandinfo(shopid, 'deals', mod)
        entries = [(brand, float(value), int(dealsinfo.get(brand, 0)))
                   for brand, value in salesinfo.iteritems() if float(value) > 0]
        brand_total = sum(sales for _, sales, _ in entries)
        brandshare[mod] = _top_shares(entries, brand_total, decode_labels=True)

    cateshare = {}
    total_sales = 0
    for mod in ['mon', 'day']:
        entries = []
        for cate1, cate2 in cates:
            info = si.getinfo(cate1, cate2, mod, shopid)
            if not info:
                continue
            cate_sales = float(info.get('sales', 0))
            if cate_sales > 0:
                entries.append((cate2, cate_sales, int(info.get('deals', 0))))
        total_sales = sum(sales for _, sales, _ in entries)
        cateshare[mod] = _top_shares(entries, total_sales)

    # NOTE(review): the persisted ``sales`` is the 'day' total of the
    # category-share entries (the last window handled above); the per-category
    # 'day' sales stored in catetrend are not what gets written -- confirm
    # that this is the intended source for the column.
    db = getdb()
    db.execute('''insert into ataobao2.shop_by_date
                    (id, datestr, worth, sales, num_collects, catetrend, brandshare, cateshare) values
                    (:shopid, :datestr, :worth, :sales, :num_collects, :catetrend, :brandshare, :cateshare)''',
               dict(worth=worth, num_collects=num_collects, shopid=shopid, datestr=date, sales=total_sales,
                    catetrend=json.dumps(catetrend), brandshare=json.dumps(brandshare),
                    cateshare=json.dumps(cateshare)))
Exemplo n.º 24
0
def mark_ready(date):
    """Insert a row into ataobao2.agghosts flagging ``date`` as ready.

    The statement runs on the handle returned by ``getdb('db1')``.
    """
    statement = 'insert into ataobao2.agghosts (datestr, ready) values (:date, true)'
    params = {'date': date}
    getdb('db1').execute(statement, params)
Exemplo n.º 25
0
def mark_ready(date):
    """Record in ataobao2.agghosts that ``date`` is ready.

    Uses the database handle obtained with ``getdb('db1')``.
    """
    ready_db = getdb('db1')
    bindings = {'date': date}
    ready_db.execute(
        'insert into ataobao2.agghosts (datestr, ready) values (:date, true)',
        bindings)