示例#1
0
 def generate_tasks(self):
     """Queue one save_iteminfos task per category for its top-selling items.

     Clears redis, then walks every category pair returned by the item
     index for self.date, plus one (cate1, 'all') pair per distinct first
     category element.  For each pair the top item ids by monthly and by
     daily sales are merged; ids already queued for an earlier pair are
     dropped so each item is scheduled at most once.  Finishes by calling
     finish_generation().
     """
     self.clear_redis()
     index = ItemIndex(self.date)

     categories = index.getcates()
     # One extra (cate1, 'all') pair for every distinct first element.
     rollups = set((pair[0], 'all') for pair in categories)
     categories.extend(list(rollups))

     queued = set()
     for top_cate, sub_cate in categories:
         monthly = set(index.gettopitemids(top_cate, sub_cate, 'sales', 'mon'))
         daily = set(index.gettopitemids(top_cate, sub_cate, 'sales', 'day'))
         fresh = (monthly | daily) - queued
         if not fresh:
             # nothing new for this pair, so no task is added
             continue
         queued.update(fresh)
         self.add_task('aggregator.iteminfo.save_iteminfos', self.date, *fresh)

     self.finish_generation()
示例#2
0
 def generate_tasks(self):
     """Schedule save_iteminfos tasks covering the top sellers of all categories.

     After clearing redis, every category pair from the item index of
     self.date is visited, followed by one (cate1, 'all') pair per distinct
     first category element.  The monthly and daily top-sales item ids of
     each pair are combined, ids that were already scheduled are removed,
     and the remainder (if any) is passed to a single task.  Ends with
     finish_generation().
     """
     self.clear_redis()
     item_index = ItemIndex(self.date)

     cate_pairs = item_index.getcates()
     # Append the (cate1, 'all') pairs, one per distinct first element.
     all_pairs = set((entry[0], 'all') for entry in cate_pairs)
     cate_pairs.extend(list(all_pairs))

     scheduled = set()
     for first, second in cate_pairs:
         by_month = set(item_index.gettopitemids(first, second, 'sales', 'mon'))
         by_day = set(item_index.gettopitemids(first, second, 'sales', 'day'))
         pending = (by_month | by_day) - scheduled
         if not pending:
             # every id of this pair is already scheduled
             continue
         scheduled.update(pending)
         self.add_task('aggregator.iteminfo.save_iteminfos', self.date,
                       *pending)

     self.finish_generation()
示例#3
0
def top10_items(date=None):
    """Store the ten items with the highest monthly sales for `date`.

    Reads the monthly sales index of the 'all' sub-category for every
    category id in topcids, keeps the ten items with the largest sales
    value and inserts them into ataobao2.top10 (field 'item') as a JSON
    list of (itemid, deals, sales) entries.

    Args:
        date: day to compute the ranking for; defaults to defaultdate.
    """
    date = defaultdate if date is None else date
    index = ItemIndex(date)

    sales_by_item = Counter()
    deals_by_item = Counter()
    for cid in topcids:
        for item, monthly_sales in index.getindex(cid, 'all', 'sales', 'mon'):
            # a later category overwrites the value of an earlier one
            sales_by_item[item] = monthly_sales
            info = index.getinfo(item, cid)
            deals_by_item[item] = int(info.get('deals_mon', 0))

    ranking = [(int(item), deals_by_item[item], monthly_sales)
               for item, monthly_sales in sales_by_item.most_common(10)]

    params = {'datestr': date, 'field': 'item', 'value': json.dumps(ranking)}
    db.execute('''insert into ataobao2.top10 (datestr, field, value) values
                    (:datestr, :field, :value)''', params)
示例#4
0
def save_iteminfos(date, *itemids):
    """Save the item info of every given item id for `date`.

    Items are processed one at a time against a single ItemIndex built for
    `date`.  A failure on one item is reported with a traceback and the
    remaining items are still processed.

    Args:
        date: day passed to ItemIndex and to save_iteminfo.
        *itemids: ids of the items to save.
    """
    ii = ItemIndex(date)
    for itemid in itemids:
        try:
            save_iteminfo(date, ii, itemid)
        except Exception:
            # Best effort per item.  Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit stop the batch instead of being
            # printed and ignored once per remaining item.
            traceback.print_exc()
示例#5
0
def top10_items(date=None):
    """Compute and persist the top ten items by monthly sales.

    For each category id in topcids the monthly sales index of the 'all'
    sub-category is read; the ten items with the largest sales value are
    written to ataobao2.top10 under field 'item' as a JSON list of
    (itemid, deals, sales) entries.

    Args:
        date: day to compute the ranking for; defaults to defaultdate.
    """
    if date is None:
        date = defaultdate
    item_index = ItemIndex(date)

    sales_counter = Counter()
    deals_counter = Counter()
    for category in topcids:
        entries = item_index.getindex(category, 'all', 'sales', 'mon')
        for item_key, sold in entries:
            # the value from the last category listing the item wins
            sales_counter[item_key] = sold
            details = item_index.getinfo(item_key, category)
            deals_counter[item_key] = int(details.get('deals_mon', 0))

    best = [(int(item_key), deals_counter[item_key], sold)
            for item_key, sold in sales_counter.most_common(10)]

    row = {'datestr': date, 'field': 'item', 'value': json.dumps(best)}
    db.execute(
        '''insert into ataobao2.top10 (datestr, field, value) values
                    (:datestr, :field, :value)''',
        row)
示例#6
0
def aggregate_items(start, end, hosts=None, date=None, retry=0):
    """Aggregate every item whose partition token lies between start and end.

    Loads the item rows and their per-day statistics for the 60 days before
    the cut-off derived from `date`, skips blacklisted items, hands each
    remaining item to aggregate_item and finally flushes the shop, brand,
    category and item indexes with execute().  Errors inside the run are
    printed with a traceback instead of being raised.

    Args:
        start: lower token bound (converted with int()).
        end: upper token bound (converted with int()).  When start > end
            the range is treated as wrapping around and is processed as the
            two ranges start..2**63-1 and -2**63..end.
        hosts: optional list of hosts.  The first one is queried through its
            own connection and the list is rotated before every retry.  When
            empty or None the handle returned by getdb() runs the queries.
        date: day to aggregate as a "%Y-%m-%d" string; defaults to
            defaultdate.
        retry: number of fetch attempts already made for this range.

    Raises:
        Exception: if called with retry >= 20.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')

    hosts = hosts or []

    if start > end:
        # Wrapping range: process both halves and stop.  Without the return
        # the inverted range itself would be queried once more below.
        aggregate_items(start, 2**63-1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        return

    retry_delay = 30  # seconds to wait before retrying a failed fetch

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        # NOTE(review): the +16h here and +8h below look like a UTC+8 day
        # boundary - confirm against the data's timezone.
        date2 = datetime.strptime(date, "%Y-%m-%d")+timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # Epoch milliseconds bound by the item_by_date query.  Computed here
        # so that both the per-host and the shared-handle path can use them.
        d2 = calendar.timegm(date2.utctimetuple())*1000
        d1 = calendar.timegm(date1.utctimetuple())*1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        # NOTE(review): the item query is start-inclusive/end-exclusive while
        # the item_by_date query is start-exclusive/end-inclusive - confirm
        # that this asymmetry is intended.
        item_query = '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end'''
        ts_query = '''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date 
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering'''

        try:
            item_params = dict(start=int(start), end=int(end))
            ts_params = dict(start=int(start), end=int(end), date1=d1, date2=d2)
            if hosts:
                conn = db.get_connection(hosts[0])
                try:
                    cur = conn.cursor()
                    cur.execute(item_query, item_params)
                    iteminfos = list(cur)
                    cur.execute(ts_query, ts_params)
                    itemts = list(cur)
                finally:
                    # release the connection even when a query fails
                    conn.close()
            else:
                iteminfos = db.execute(item_query, item_params, result=True).results
                itemts = db.execute(ts_query, ts_params, result=True).results
        except Exception:
            failed_host = hosts[0] if hosts else None
            print('cluster error on host {}, range {}, retry {}, sleeping {} secs...'.format(
                failed_host, (start, end), retry, retry_delay))
            # move the last host to the front so the retry uses another one
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(retry_delay)
            return aggregate_items(start, end, date=date, hosts=hosts, retry=retry+1)

        # itemid -> {"%Y-%m-%d": [num_collects, num_reviews, num_sold30,
        #                         num_views, price]}
        itemtsdict = {}
        for row in itemts:
            itemid, rowdate, values = row[0], row[1], list(row[2:])
            # fix malformed data:
            # num_collects (index 0) should not be larger than 2**24 (~16 million)
            if values[0] > 2**24:
                values[0] = 0
            if not isinstance(rowdate, datetime):
                # raw value: big-endian 64-bit epoch milliseconds
                rowdate = datetime.utcfromtimestamp(struct.unpack('!q', rowdate)[0]/1000)
            day = (rowdate+timedelta(hours=8)).strftime("%Y-%m-%d")
            itemtsdict.setdefault(itemid, {})[day] = values

        for itemid, shopid, cid, nc, price, brand, name, image, nr, credit_score, title, item_type in iteminfos:
            if in_blacklist(shopid, price, cid, nc, nr, credit_score, title, item_type, itemid=itemid):
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemtsdict.get(itemid):
                try:
                    if shopid == 0:
                        # items with shop id 0 are deleted instead of aggregated
                        db.execute('delete from ataobao2.item where id=:id', dict(id=itemid))
                        db.execute('delete from ataobao2.item_by_date where id=:id', dict(id=itemid))
                        continue
                except Exception:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid], shopid, cid, price, brand, name, image, datestr)
                except Exception:
                    # one bad item must not abort the whole range
                    traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()
示例#7
0
def aggregate_items(start, end, hosts=None, date=None, retry=0):
    """Aggregate every item whose partition token lies between start and end.

    Loads the item rows and their per-day statistics for the 60 days before
    the cut-off derived from `date`, skips blacklisted items, hands each
    remaining item to aggregate_item and finally flushes the shop, brand,
    category and item indexes with execute().  Errors inside the run are
    printed with a traceback instead of being raised.

    Args:
        start: lower token bound (converted with int()).
        end: upper token bound (converted with int()).  When start > end
            the range is treated as wrapping around and is processed as the
            two ranges start..2**63-1 and -2**63..end.
        hosts: optional list of hosts.  The first one is queried through its
            own connection and the list is rotated before every retry.  When
            empty or None the handle returned by getdb() runs the queries.
        date: day to aggregate as a "%Y-%m-%d" string; defaults to
            defaultdate.
        retry: number of fetch attempts already made for this range.

    Raises:
        Exception: if called with retry >= 20.
    """
    if retry >= 20:
        raise Exception('retry too many times, give up')

    hosts = hosts or []

    if start > end:
        # Wrapping range: process both halves and stop.  Without the return
        # the inverted range itself would be queried once more below.
        aggregate_items(start, 2**63 - 1, hosts, date, retry)
        aggregate_items(-2**63, end, hosts, date, retry)
        return

    retry_delay = 30  # seconds to wait before retrying a failed fetch

    try:
        db = getdb()
        if date is None:
            date = defaultdate
        datestr = date
        # NOTE(review): the +16h here and +8h below look like a UTC+8 day
        # boundary - confirm against the data's timezone.
        date2 = datetime.strptime(date, "%Y-%m-%d") + timedelta(hours=16)
        date1 = date2 - timedelta(days=60)
        # Epoch milliseconds bound by the item_by_date query.  Computed here
        # so that both the per-host and the shared-handle path can use them.
        d2 = calendar.timegm(date2.utctimetuple()) * 1000
        d1 = calendar.timegm(date1.utctimetuple()) * 1000
        si = ShopIndex(date)
        ii = ItemIndex(date)
        bi = BrandIndex(date)
        ci = CategoryIndex(date)
        si.multi()
        ii.multi()
        bi.multi()
        ci.multi()

        # NOTE(review): the item query is start-inclusive/end-exclusive while
        # the item_by_date query is start-exclusive/end-inclusive - confirm
        # that this asymmetry is intended.
        item_query = '''select id, shopid, cid, num_sold30, price, brand, title, image, num_reviews, credit_score, title, type
                    from ataobao2.item where token(id)>=:start and token(id)<:end'''
        ts_query = '''select id, date, num_collects, num_reviews, num_sold30, num_views, price from ataobao2.item_by_date 
                    where token(id)>:start and token(id)<=:end and date>=:date1 and date<:date2 allow filtering'''

        try:
            item_params = dict(start=int(start), end=int(end))
            ts_params = dict(start=int(start),
                             end=int(end),
                             date1=d1,
                             date2=d2)
            if hosts:
                conn = db.get_connection(hosts[0])
                try:
                    cur = conn.cursor()
                    cur.execute(item_query, item_params)
                    iteminfos = list(cur)
                    cur.execute(ts_query, ts_params)
                    itemts = list(cur)
                finally:
                    # release the connection even when a query fails
                    conn.close()
            else:
                iteminfos = db.execute(item_query, item_params,
                                       result=True).results
                itemts = db.execute(ts_query, ts_params, result=True).results
        except Exception:
            failed_host = hosts[0] if hosts else None
            print(
                'cluster error on host {}, range {}, retry {}, sleeping {} secs...'
                .format(failed_host, (start, end), retry, retry_delay))
            # move the last host to the front so the retry uses another one
            hosts = hosts[-1:] + hosts[:-1]
            time.sleep(retry_delay)
            return aggregate_items(start,
                                   end,
                                   date=date,
                                   hosts=hosts,
                                   retry=retry + 1)

        # itemid -> {"%Y-%m-%d": [num_collects, num_reviews, num_sold30,
        #                         num_views, price]}
        itemtsdict = {}
        for row in itemts:
            itemid, rowdate, values = row[0], row[1], list(row[2:])
            # fix malformed data:
            # num_collects (index 0) should not be larger than 2**24 (~16 million)
            if values[0] > 2**24:
                values[0] = 0
            if not isinstance(rowdate, datetime):
                # raw value: big-endian 64-bit epoch milliseconds
                rowdate = datetime.utcfromtimestamp(
                    struct.unpack('!q', rowdate)[0] / 1000)
            day = (rowdate + timedelta(hours=8)).strftime("%Y-%m-%d")
            itemtsdict.setdefault(itemid, {})[day] = values

        for itemid, shopid, cid, nc, price, brand, name, image, nr, credit_score, title, item_type in iteminfos:
            if in_blacklist(shopid,
                            price,
                            cid,
                            nc,
                            nr,
                            credit_score,
                            title,
                            item_type,
                            itemid=itemid):
                continue
            brand = clean_brand(brand)
            if nc > 0 and itemtsdict.get(itemid):
                try:
                    if shopid == 0:
                        # items with shop id 0 are deleted instead of aggregated
                        db.execute('delete from ataobao2.item where id=:id',
                                   dict(id=itemid))
                        db.execute(
                            'delete from ataobao2.item_by_date where id=:id',
                            dict(id=itemid))
                        continue
                except Exception:
                    traceback.print_exc()
                try:
                    aggregate_item(si, ii, bi, ci, itemid, itemtsdict[itemid],
                                   shopid, cid, price, brand, name, image,
                                   datestr)
                except Exception:
                    # one bad item must not abort the whole range
                    traceback.print_exc()

        si.execute()
        bi.execute()
        ci.execute()
        ii.execute()
    except Exception:
        traceback.print_exc()