def check_case_3(): # no clicks: impressions on 3 consecutive days but zero clicks
    """Export online adgroups that got impressions but zero clicks over the
    last 3 days, one CSV row per offending adgroup (via export_to_file)."""
    csv_data_list = []
    csv_title = ['shop_id', 'camp_id', 'item_id', 'adg_id', '3_day_impr', '3_day_click']
    local_camp_list = Campaign.objects(Q(online_status = 'online')).order_by('+campaign_id') # every enabled campaign
    for temp_camp_list in genr_sublist(local_camp_list, 10): # walk campaigns in chunks of 10
        for local_camp in temp_camp_list:
            # every enabled adgroup of this campaign, reports summed over 3 days
            local_adg_list = Adgroup.objects(Q(online_status = 'online') & Q(campaign_id = local_camp.campaign_id)).sum_reports(rpt_days = 3)
            for temp_adg_list in genr_sublist(local_adg_list, 10):
                for local_adg in temp_adg_list:
                    print 'camp_id=%s adg_id=%s qr.impr=%s qr.click=%s' % (local_camp.campaign_id, local_adg.adgroup_id, local_adg.qr.impressions, local_adg.qr.click)
                    # impressions without a single click -> report it
                    if local_adg.qr.impressions > 0 and local_adg.qr.click == 0:
                        csv_data_list.append([
                            local_adg.shop_id,
                            local_adg.campaign_id,
                            local_adg.item_id,
                            local_adg.adgroup_id,
                            local_adg.qr.impressions,
                            local_adg.qr.click])
    export_to_file('(check_case_3)', csv_title, csv_data_list)
def check_case_2(): # low quality score: average qscore below 6.5 AND over 40% of words below 5
    """Export online adgroups whose average keyword quality score is under 6.5
    and whose share of keywords scoring below 5 exceeds 40%."""
    csv_data_list = []
    csv_title = ['shop_id', 'camp_id', 'item_id', 'adg_id', 'kw_avg', 'kw_rate']
    local_camp_list = Campaign.objects(Q(online_status = 'online')).order_by('+campaign_id') # every enabled campaign
    for temp_camp_list in genr_sublist(local_camp_list, 10):
        for local_camp in temp_camp_list:
            # every enabled adgroup of this campaign
            local_adg_list = Adgroup.objects(Q(online_status = 'online') & Q(campaign_id = local_camp.campaign_id))
            for temp_adg_list in genr_sublist(local_adg_list, 10):
                for local_adg in temp_adg_list:
                    avg = get_avg_qscore(local_adg.adgroup_id)    # average quality score of the adgroup's keywords
                    rate = get_qscore_rate(local_adg.adgroup_id)  # share of keywords below the score threshold
                    print 'camp_id=%s adg_id=%s avg=%s rate=%s' % (local_camp.campaign_id, local_adg.adgroup_id, avg, rate)
                    if avg and rate and avg < 6.5 and rate > 0.4:
                        csv_data_list.append([
                            local_adg.shop_id,
                            local_adg.campaign_id,
                            local_adg.item_id,
                            local_adg.adgroup_id,
                            avg,
                            rate])
    export_to_file('(check_case_2)', csv_title, csv_data_list)
def check_case_1(): # 词少, 关键词少于100个 csv_data_list = [] csv_title = ['shop_id', 'camp_id', 'item_id', 'adg_id', 'kw_count'] local_camp_list = Campaign.objects(online_status = 'online').order_by('+campaign_id') # 查询所有开启的计划 for temp_camp_list in genr_sublist(local_camp_list, 10): for local_camp in temp_camp_list: local_adg_list = Adgroup.objects(online_status = 'online', campaign_id = local_camp.campaign_id).order_by('adgroup_id') # 查询所有开启的推广组 for temp_adg_list in genr_sublist(local_adg_list, 10): for local_adg in temp_adg_list: local_kw_list = Keyword.objects.only('word').filter(adgroup_id = local_adg.adgroup_id) kw_count = len(local_kw_list) print 'camp_id=%s adg_id=%s kw_count=%s' % (local_camp.campaign_id, local_adg.adgroup_id, kw_count) if kw_count < 100: csv_data_list.append([local_adg.shop_id, local_adg.campaign_id, local_adg.item_id, local_adg.adgroup_id, kw_count]) export_to_file('(check_case_1)', csv_title, csv_data_list)
def repair_longmnt(): from apps.common.utils.utils_collection import genr_sublist from apps.subway.models_account import Account from apps.subway.models_adgroup import adg_coll shops = Account.objects.only('shop_id').all() shop_ids = [s.shop_id for s in shops] total_count = len(shop_ids) cur_count = 0 for shop_id_list in genr_sublist(shop_ids, 50): adg_coll.update( { 'shop_id': { '$in': shop_id_list }, 'use_camp_limit': 0, 'mnt_type': { '$in': [1, 3] } }, {'$set': { 'use_camp_list': 1 }}, multi=True) cur_count += 50 print 'total_count=%s, cur_count=%s, %s%%, last_shop_id=%s' % ( total_count, cur_count, round(cur_count / total_count, 4) * 100, shop_id_list[-1]) print 'ok'
def load_redis_newcat_word_2memcache(cls):
    ''' Push newly collected category words from redis into memcache in real time.

    For each category id in the redis set 'new_cat_word_set', drain the
    '<cat_id>_new_word' list, enrich the words, and append them to the
    numbered memcache sub-lists ('<cat_id>_<n>', at most 4500 words each),
    keeping the sub-list count under the bare cat_id key up to date.
    '''
    from apps.kwslt.select_words import MemcacheAdpter
    r = WordCat.r_wckeyword
    for cat_id in r.smembers('new_cat_word_set'):
        # how many sub-lists this category already has in memcache
        count = MemcacheAdpter.get_list_count(str(cat_id), 'kwlib')
        word_list = [word.decode('utf8') for word in r.lrange('%s_new_word' % cat_id, 0, -1)]
        if word_list:
            word_list = WordCat.get_wordcat_data_2memcache(word_list, cat_id)
            if count:
                # last existing sub-list: top it up if it still has room (< 4500)
                cache_word_list = CacheAdpter.get('%s_%s' % (cat_id, count - 1), 'kwlib')
                if len(cache_word_list) < 4500:
                    word_list = cache_word_list + word_list
                    count = count - 1  # rewrite the last sub-list in place
            else:
                count = 0
            for wl in genr_sublist(word_list, 4500):
                CacheAdpter.set('%s_%s' % (cat_id, count), wl, 'kwlib')
                count += 1
            CacheAdpter.set(str(cat_id), count, 'kwlib')  # record new sub-list count
            r.delete('%s_new_word' % cat_id)  # the redis queue is now drained
    r.delete('new_cat_word_set')
def clean_garbage_word(cls, key):
    ''' Drop keywords that have had no whole-network data for a long time.

    Reads "word:sort_word" entries from the redis list at ``key``. Entries
    whose hash record is missing, or whose 'upt_tm' is older than 200 days,
    are deleted; the rest are re-queued onto the new-keyword list in chunks
    of 10000.
    '''
    word_list = [kw.decode('utf8') for kw in cls.r_keyword.lrange(key, 0, -1)]
    insert_list, delete_list = [], []
    # cutoff is loop-invariant: hoisted out of the loop
    min_update_time = datetime.datetime.now() - datetime.timedelta(days = 200)
    for word in word_list:
        tmp_list = word.split(':')
        wd, sort_word = tmp_list[0], tmp_list[1]
        update_time = cls.r_hkeyword.hget(sort_word, 'upt_tm')
        if update_time == None:
            delete_list.append(sort_word)
            continue
        # BUG FIX: the old "if not cmp(a, b)" matched only timestamps exactly
        # EQUAL to the cutoff, so genuinely stale words were re-inserted
        # instead of deleted; compare with < to catch everything older
        if datetime.datetime.strptime(update_time, "%Y-%m-%d") < min_update_time:
            delete_list.append(sort_word)
        else:
            insert_list.append(wd + ':' + sort_word)
    if delete_list:  # redis DEL with zero keys raises an error
        cls.r_hkeyword.delete(*delete_list)
    cls.r_keyword.delete(key)
    for word_list in genr_sublist(insert_list, 10000):
        key_keyword_list = RedisKeyManager.get_keyword_list_key(cls.NEWKEYWORD_ALIAS, cls.NEW_KW_LIST_PREV_KEY)
        RedisKeyManager.redis_lpush(cls.r_nkeyword, key_keyword_list, word_list)
def check_case_6(): # 点击率低, 点击率低于市场点击率的70% csv_data_list = [] csv_title = ['shop_id', 'camp_id', 'item_id', 'adg_id', '7_day_ctr', 'cat_avg_ctr'] local_camp_list = Campaign.objects(Q(online_status = 'online')).order_by('+campaign_id') # 查询所有开启的计划 for temp_camp_list in genr_sublist(local_camp_list, 10): for local_camp in temp_camp_list: local_adg_list = Adgroup.objects(Q(online_status = 'online') & Q(campaign_id = local_camp.campaign_id)).sum_reports(rpt_days = 7) # 查询所有开启的推广组 for temp_adg_list in genr_sublist(local_adg_list, 10): for local_adg in temp_adg_list: item = local_adg.item_id cat_data = local_adg.cat_data ctr = local_adg.qr.ctr cat_avg_ctr = get_cat_avg_ctr(item, cat_data) print 'camp_id=%s adg_id=%s ctr=%s cat.avg_ctr=%s' % (local_camp.campaign_id, local_adg.adgroup_id, ctr, cat_avg_ctr) if cat_avg_ctr and local_adg.qr.ctr < cat_avg_ctr * 0.7: csv_data_list.append([ local_adg.shop_id, local_adg.campaign_id, local_adg.item_id, local_adg.adgroup_id, ctr]) export_to_file('(check_case_6)', csv_title, csv_data_list)
def check_case_7(): # 点击转化率低, 点击转化率低于市场点击率的100% csv_data_list = [] csv_title = ['shop_id', 'camp_id', 'item_id', 'adg_id', 'kw_count', '7_day_conv', 'cat_avg_conv'] local_camp_list = Campaign.objects(Q(online_status = 'online')).order_by('+campaign_id') # 查询所有开启的计划 for temp_camp_list in genr_sublist(local_camp_list, 10): for local_camp in temp_camp_list: local_adg_list = Adgroup.objects(Q(online_status = 'online') & Q(campaign_id = local_camp.campaign_id)).sum_reports(rpt_days = 7) # 查询所有开启的推广组 for temp_adg_list in genr_sublist(local_adg_list, 10): for local_adg in temp_adg_list: item = local_adg.item cat_data = local_adg.cat_data conv = local_adg.qr.conv cat_avg_conv = get_cat_avg_conv(item, cat_data) print 'camp_id=%s adg_id=%s qr.conv=%s cat.avg_conv=%s' % (local_camp.campaign_id, local_adg.adgroup_id, conv, cat_avg_conv) if cat_avg_conv and conv < cat_avg_conv * 1.0: csv_data_list.append([ local_adg.shop_id, local_adg.campaign_id, local_adg.item_id, local_adg.adgroup_id, conv, cat_avg_conv]) export_to_file('(check_case_7)', csv_title, csv_data_list)
def insert_new_word_list(cls, word_list):
    ''' Queue brand-new keywords onto the new-keyword redis list. '''
    pending = cls.get_insert_list(word_list)
    if not pending:
        return
    # push at most 10000 words per redis call
    for chunk in genr_sublist(pending, 10000):
        list_key = RedisKeyManager.get_keyword_list_key(cls.NEWKEYWORD_ALIAS, cls.NEW_KW_LIST_PREV_KEY)
        RedisKeyManager.redis_lpush(cls.r_nkeyword, list_key, chunk)
def struct_download(cls, shop_id, tapi):
    """Initialise the shop's creatives: sync the TOP API view of the shop's
    creatives into crt_coll (insert new, update existing, remove vanished).

    Returns True on success, False on any failure (logged)."""
    try:
        top_creative_list = cls.get_creatives_byadgids(shop_id=shop_id, tapi=tapi)
        # creative ids we already hold locally for this shop
        local_crt_id_list = [
            crt['_id']
            for crt in crt_coll.find({'shop_id': shop_id}, {'_id': 1})
        ]
        upd_crt_dict, insert_crt_list, old_crt_id_list = {}, [], []
        for crt in top_creative_list:
            if crt.creative_id in local_crt_id_list:
                # already known -> incremental update
                upd_crt_dict.update({
                    crt.creative_id:
                    cls.Parser.parse(crt, trans_type='inc')
                })
                old_crt_id_list.append(crt.creative_id)
            else:
                # new on the TOP side -> full initial parse
                insert_crt_list.append(
                    cls.Parser.parse(crt,
                                     trans_type='init',
                                     extra_dict={'shop_id': shop_id}))
        # local ids no longer present upstream get removed below
        del_crt_id_list = list(
            set(local_crt_id_list) - set(old_crt_id_list))
        for temp_insert_list in genr_sublist(insert_crt_list, 50):
            crt_coll.insert(temp_insert_list)
        update_list = []
        for crt_id, update_info in upd_crt_dict.items():
            update_list.append(({
                'shop_id': shop_id,
                '_id': crt_id
            }, {
                '$set': update_info
            }))
        Creative.bulk_update_crt2db(update_list)
        if del_crt_id_list:
            cls.remove_creative({
                'shop_id': shop_id,
                '_id': {
                    '$in': del_crt_id_list
                }
            })
        log.info('init creatives OK, shop_id = %s, get %s creatives' %
                 (shop_id, len(top_creative_list)))
        return True
    except Exception, e:
        log.error('init creatives FAILED, shop_id = %s, e = %s' %
                  (shop_id, e))
        return False
def bulk_update_for_model(obj_list, commit_number = 1000):
    '''Save model objects in batches: obj_list is split into groups of
    commit_number and each group is committed as one transaction; a failed
    group is rolled back, logged, and the exception re-raised.'''
    for temp_list in genr_sublist(obj_list, commit_number):
        try:
            for obj in temp_list:
                obj.save(False, False)
            transaction.commit()  # one commit per batch
        except Exception, e:
            log.exception("Object save error, e=%s" % (e))
            transaction.rollback()
            raise e
def bulk_update_db(cls, update_list): total_updated_num = 0 for temp_list in genr_sublist(update_list, 1000): # bulk一次最多1000个 bulk = cls._get_collection().initialize_unordered_bulk_op() for update_tuple in temp_list: bulk.find(update_tuple[0]).update(update_tuple[1]) try: result = bulk.execute() total_updated_num += result['nModified'] except BulkWriteError, e: log.error('bulk_update_kw2db, detail=%s' % e.details) total_updated_num += e.details['nModified']
def check_case_5(): # PPC高, 高于市场均价的1.2倍,大于 min(客单价*行业转化率/roi, 每个 关键词点击量*行业ppc 的和/所有点击, 日限额/50) csv_data_list = [] csv_title = ['shop_id', 'camp_id', 'item_id', 'adg_id', 'ppc', 'cat_avg_ppc', 'roi', 'cat_avg_conv', 'budget'] local_camp_list = Campaign.objects(Q(online_status = 'online')).order_by('+campaign_id') # 查询所有开启的计划 for temp_camp_list in genr_sublist(local_camp_list, 10): for local_camp in temp_camp_list: local_adg_list = Adgroup.objects(Q(online_status = 'online') & Q(campaign_id = local_camp.campaign_id)).sum_reports(rpt_days = 7) # 查询所有开启的推广组 for temp_adg_list in genr_sublist(local_adg_list, 10): for local_adg in temp_adg_list: item = local_adg.item cat_data = local_adg.cat_data ppc = local_adg.qr.cpc roi = local_adg.qr.roi cat_avg_ppc = get_cat_avg_cpc(item, cat_data) cat_avg_conv = get_cat_avg_conv(item, cat_data) result = calc_kw_1(local_adg) if local_adg.item and local_adg.item.price and cat_avg_ppc and cat_avg_conv and result and roi and ppc: print 'camp_id=%s adg_id=%s ppc=%s roi=%s cat_avg_ppc=%s cat_avg_conv=%s budget=%s' % (local_camp.campaign_id, local_adg.adgroup_id, ppc, roi, cat_avg_ppc, cat_avg_conv, local_camp.budget) if ppc > cat_avg_ppc * 1.2 and min(local_adg.item.price * cat_avg_conv / roi, result, local_camp.budget / 50): csv_data_list.append([ local_adg.shop_id, local_adg.campaign_id, local_adg.item_id, local_adg.adgroup_id, ppc, cat_avg_ppc, roi, cat_avg_conv, local_camp.budget]) export_to_file('(check_case_5)', csv_title, csv_data_list)
def check_case_4(): # few clicks: 7-day average clicks below max(10, scaled reference from a sample category)
    """Export online adgroups whose 7-day average daily clicks fall below
    max(10, example_cat_std_click * cat_click / example_cat_click), i.e. a
    threshold scaled from a hand-picked reference category (id 150704)."""
    example_cat_std_click = 100  # reference threshold for the sample category
    example_cat_click = get_cat_click_8id(150704)  # sample category's own click volume
    csv_data_list = []
    csv_title = ['shop_id', 'camp_id', 'item_id', 'adg_id', '7_day_avg_click', 'cat_click', 'example_cat_std_click', 'example_cat_click', 'current_cat_click', 'expression']
    local_camp_list = Campaign.objects(Q(online_status = 'online')).order_by('+campaign_id') # every enabled campaign
    for temp_camp_list in genr_sublist(local_camp_list, 10):
        for local_camp in temp_camp_list:
            # every enabled adgroup of this campaign, reports summed over 7 days
            local_adg_list = Adgroup.objects(Q(online_status = 'online') & Q(campaign_id = local_camp.campaign_id)).sum_reports(rpt_days = 7)
            for temp_adg_list in genr_sublist(local_adg_list, 10):
                for local_adg in temp_adg_list:
                    item = local_adg.item          # NOTE(review): unused here -- confirm safe to drop
                    cat_data = local_adg.cat_data  # NOTE(review): unused here -- confirm safe to drop
                    # NOTE(review): Python 2 integer division floors both of
                    # these if the operands are ints -- confirm intended
                    click = local_adg.qr.click / 7
                    cat_click = get_cat_all_click(local_adg)
                    if cat_click and example_cat_std_click and example_cat_click:
                        expression = example_cat_std_click * cat_click / example_cat_click
                        print 'camp_id=%s adg_id=%s qr.click=%s cat.click=%s example_cat_std_click=%s example_cat_click=%s expression=%s' % (local_camp.campaign_id, local_adg.adgroup_id, click, cat_click, example_cat_std_click, example_cat_click, expression)
                        if click < max(10, expression):
                            csv_data_list.append([
                                local_adg.shop_id,
                                local_adg.campaign_id,
                                local_adg.item_id,
                                local_adg.adgroup_id,
                                click,
                                cat_click,
                                example_cat_std_click,
                                example_cat_click,
                                cat_click,
                                expression])
    export_to_file('(check_case_4)', csv_title, csv_data_list)
def get_catinfo_new(select_type, category_id_list=[] ): # TODO 该返回值信息量很大,所以是否要保留类目树需要待定,或者修改类目树的基本结构,所以该操作暂时待定 """ 新的获取类目信息接口,获取类目信息,此接口既提供所有顶级类目的查询, 又提供给定类目id自身信息和子类目信息的查询,所以可以根据此接口逐层获取所有的类目信息 传入参数: type: 0 表示请求所有顶级类目的信息,这时可以忽略第二个参数 1 表示获取给定的类目id的详细信息 2 表示获取给定类目id的所有子类目的详细信息 category_id_list:以逗号隔开的类目id "16,30" 返回值: parent_cat_id:父类目id cat_name:类目名称 cat_path_name:类目路径名称 cat_id:类目id cat_level:类目层级 last_sync_time:最后一次同步时间 cat_path_id:类目路径id """ def get_cat_data_info(tobj, cat_dict): if tobj and hasattr(tobj, "category_info_list"): category_info_list = tobj.category_info_list if hasattr(category_info_list, "insight_category_info_d_t_o"): insight_category_info_d_t_o = category_info_list.insight_category_info_d_t_o for cat in insight_category_info_d_t_o: cat_id = cat.cat_id cat_dict[cat_id] = { 'cat_id': cat_id, 'cat_path_name': cat.cat_path_name, 'parent_cat_id': cat.parent_cat_id, 'cat_name': cat.cat_name, 'cat_level': cat.cat_level, 'last_sync_time': cat.last_sync_time, 'cat_path_id': cat.cat_path_id } return cat_dict tobj = None cat_dict = {} try: if category_id_list: for tmp_list in genr_sublist(category_id_list, 10): tobj = tsapi.simba_insight_catsinfo_get( type=select_type, category_id_list=','.join(tmp_list)) cat_dict = get_cat_data_info(tobj, cat_dict) else: tobj = tsapi.simba_insight_catsinfo_get(type=select_type) cat_dict = get_cat_data_info(tobj, cat_dict) except TopError, e: log.error('get simba_insight_catsinfo_get error, e=%s' % (e)) return {}
def bulk_update_mongodb( coll, update_list ): # update_list形如[({'_id':1024321654}, {'$set':{'max_price':24}}), ({'_id':1024321651}, {'$set':{'max_price':47}}),...] total_updated_num = 0 for temp_list in genr_sublist(update_list, 1000): # bulk一次最多1000个 bulk = coll.initialize_unordered_bulk_op() for update_tuple in temp_list: bulk.find(update_tuple[0]).update(update_tuple[1]) try: result = bulk.execute() total_updated_num += result['nModified'] except BulkWriteError, e: log.error('bulk_update_mongodb, detail=%s' % e.details) total_updated_num += e.details['nModified']
def bulk_update_for_sql(sql, value_list, commit_number = 1000): '''批量执行sql语句保存数据,按照commit_number将value_list分组批量更新''' rowcount = 0 for group_value_list in genr_sublist(value_list, commit_number): if group_value_list: try: cursor = connection.cursor() for temp_list in group_value_list: rowcount += cursor.executemany(sql, temp_list) # sql语句里的字符串型参数 %s 两旁不加引号! transaction.commit() except Exception, e: log.exception("SQL update error, e=%s" % (e)) transaction.rollback() raise e
def set_large_list(key, value_list, cache_name='kwlib', timeout=24 * 60 * 60 * 30, count=4500):
    """Store an oversized list as numbered sub-lists of at most ``count``
    items each ('<key>_0', '<key>_1', ...), plus one entry under ``key``
    recording how many sub-lists were written."""
    use_cache = get_cache(cache_name)
    chunk_map = {}
    total_chunks = 0
    for index, chunk in enumerate(genr_sublist(value_list, count)):
        chunk_map['%s_%s' % (key, index)] = chunk
        total_chunks = index + 1
    use_cache.set_many(chunk_map, timeout)       # the chunks themselves
    use_cache.set(key, total_chunks, timeout)    # how many chunks exist
def cat_data_list(cat_id_list, start_date, end_date): # TODO 需要测试起始时间和终止时间的最长时间和最短时间 """ 根据类目id获取类目的大盘数据,其中cpc, ctr, cvr, roi这几个指标数据是真实数据,其它的数据都是通过指数化后的数据, 其中competition这个字段的目前无法做到精确统计,只是一个参考值,本次提供的insight相关的其它接口的都是这种情况。 cat_id_list格式为:"16,30"以逗号分割 start_data格式为:yyyy-MM-dd 起始时间 end_data格式为:yyyy-MM-dd 终止时间 返回值: impression:展现量 click:点击量 cost:花费,单位(分) directtransaction:直接成交额 indirecttransaction:间接成交额 directtransactionshipping:直接成交笔数 indirecttransactionshipping:间接成交笔数 favitemtotal:宝贝收藏数 favshoptotal:店铺收藏数 transactionshippingtotal:总的成交笔数 transactiontotal:成交总金额 favtotal:总的收藏数,包括宝贝收藏和店铺收藏 competition:竞争度 ctr:点击率 cpc:平均点击花费 roi:投入产出比 coverage:点击转化率 cat_id:类目id cat_name:类目名称 """ cat_id_list = [str(cat_id) for cat_id in cat_id_list] top_cat_data_dict = {} for cat_list in genr_sublist(cat_id_list, 5): try: tobj = tsapi.simba_insight_catsdata_get( category_id_list=','.join(cat_list), start_date=start_date, end_date=end_date) if hasattr(tobj, "cat_data_list"): cat_data_list = tobj.cat_data_list if hasattr(cat_data_list, "insight_category_data_d_t_o"): insight_category_data_d_t_o = cat_data_list.insight_category_data_d_t_o for cat_data in insight_category_data_d_t_o: top_cat_data_dict[cat_data.cat_id] = cat_data except TopError, e: log.error('get simba_insight_catsdata_get error, e=%s' % (e)) continue
def set_large_list(cls, key, value_list, split_size, cache_name, timeout=0):
    """Split ``value_list`` into chunks of ``split_size``, cache each chunk
    under '<key>_<n>' and record the chunk count under ``key``.

    Returns True when something was stored; returns None for an empty
    value_list (nothing written)."""
    if not value_list:
        return
    use_cache = get_cache(cache_name)
    chunks = list(genr_sublist(value_list, split_size))
    chunk_map = dict(('%s_%s' % (key, i), sub) for i, sub in enumerate(chunks))
    use_cache.set_many(chunk_map, timeout)   # the split-up data
    use_cache.set(key, len(chunks), timeout) # the N groups it was split into
    return True
def __get_creative_bycrtids(cls, shop_id, creative_id_list, tapi): """ 通过creative_id获取创意 """ creative_list = [] for temp_crt_id_list in genr_sublist(creative_id_list, 200): creative_ids = ','.join( str(creative_id) for creative_id in temp_crt_id_list) try: top_objs = tapi.simba_creatives_get( creative_ids=creative_ids, retry_count=settings.TAPI_RETRY_COUNT * 6, retry_delay=settings.TAPI_RETRY_DELAY) if top_objs and hasattr(top_objs, 'creatives') and hasattr( top_objs.creatives, 'creative') and top_objs.creatives.creative: creative_list.extend(top_objs.creatives.creative) except TopError, e: log.error( 'simba_creatives_get TopError, shop_id = %s, e = %s' % (shop_id, e))
def get_cats_forecast_new(word_list): """ 根据传入的关键词获取该关键词适合的类目 新的类目预测接口,传入参数为纯word以逗号隔开,淘宝没有提供传入字符串长度,所以默认长度为原有的200以内。 淘宝返回值: cat_path_name:类目路径及名称 bidword:关键词 score:类目相关度 cat_path_id:类目路径id 当前返回值: { '连衣裙':[1,2,3,4], ... .. } """ cat_forecast_dict = {} for temp_list in genr_sublist(word_list, 100): try: tobj = tsapi.simba_insight_catsforecastnew_get( bidword_list=",".join(temp_list)) if hasattr(tobj, "category_forecast_list"): category_forecast_list = tobj.category_forecast_list if hasattr(category_forecast_list, "insight_category_forcast_d_t_o"): insight_category_forcast_d_t_o = category_forecast_list.insight_category_forcast_d_t_o for catforecast in insight_category_forcast_d_t_o: cat_path_id = catforecast.cat_path_id if cat_path_id == "": continue cat_id = int(catforecast.cat_path_id.split(' ')[-1]) word = catforecast.bidword if catforecast.bidword in cat_forecast_dict: cat_forecast_dict[word].append(cat_id) else: cat_forecast_dict[word] = [cat_id] except TopError, e: log.error('get simba_insight_catsforecastnew_get error, e=%s' % (e)) continue
def get_relatewords_new(word_list, number=10): # TODO 调用长度是一个非常严重的问题 """ 获取给定词的若干相关词,返回结果中越相关的权重越大,排在越前面,根据number参数对返回结果进行截断。 以词滚词,扩充词库 传入参数: bidword_list:以逗号分割的关键词字符串 number:返回个数 返回值: related_word:相关关键词 weight:相关度 函数返回值: [ 连衣裙, 连衣裙夏, ... .. . ] """ relate_list = [] for temp_list in genr_sublist(word_list, 100): try: tobj = tsapi.simba_insight_relatedwords_get( bidword_list=','.join(temp_list), number=number) if hasattr(tobj, "related_words_result_list"): related_words_result_list = tobj.related_words_result_list if hasattr(related_words_result_list, "insight_related_words"): insight_related_words = related_words_result_list.insight_related_words for relate_word in insight_related_words: if hasattr(relate_word, "related_word_items_list"): related_word_items_list = relate_word.related_word_items_list if hasattr(related_word_items_list, "insight_related_word"): insight_related_word = related_word_items_list.insight_related_word for word in insight_related_word: relate_list.append( word.related_word.replace( '\t', '').decode('utf8')) except TopError, e: log.error('get simba_insight_relatedwords_get error, e=%s' % (e))
def generate_task(self):
    """Build keyword-report download tasks for this shop's adgroups.

    Aggregates report rows per (adgroup, date), keeps only dates newer than
    each adgroup's last-sync date (self.adg_date_dict), then for each run of
    up to 5 consecutive dates and each (search_type, source) in
    KeywordRpt.REPORT_CFG appends one task dict to self.task_list and its
    cache key to self.key_list."""
    adg_date_list = collections.defaultdict(list)  # adgroup_id -> [date, ...] needing sync
    query_dict = {'shop_id': self.shop_id, 'adgroup_id': {'$in': self.adgroup_id_list}}
    result = Adgroup.Report.aggregate_rpt(query_dict= query_dict, group_keys = "adgroup_id,date", start_date = str(self.earliest_date), end_date = str(self.end_date))
    for adg_rpt in result:
        temp_adg_id = adg_rpt['_id']['adgroup_id']
        temp_datetime = adg_rpt['_id']['date']
        # mongodb stores a full datetime; strip hours/minutes/seconds to a date
        temp_date = datetime.date(temp_datetime.year, temp_datetime.month, temp_datetime.day)
        # only dates beyond this adgroup's last keyword sync are of interest
        if temp_date >= self.adg_date_dict[temp_adg_id]:
            adg_date_list[temp_adg_id].append(temp_date)
    # NOTE(review): the loop variable shadows the defaultdict name -- works, but fragile
    for adg_id, adg_date_list in adg_date_list.items():
        adg_date_list.sort()
        for temp_time_scope in genr_sublist(adg_date_list, 5):
            for search_type, source in KeywordRpt.REPORT_CFG:
                # detail data and summary data are split into separate tasks
                desc = source == "SUMMARY" and "summary" or "detail"
                start_time = str(temp_time_scope[0])[:10]
                end_time = str(temp_time_scope[-1])[:10]
                cache_key = "%s_%s_%s_%s" % (adg_id, start_time, end_time, desc)
                self.task_list.append({
                    'shop_id': self.shop_id,
                    'campaign_id': self.campaign_id,
                    'token': self.token,
                    'adgroup_id': adg_id,
                    'source': source,
                    'search_type': search_type,
                    'start_time': start_time,
                    'end_time': end_time,
                    'cache_db': self.cache_db,
                    'cache_key': cache_key
                })
                self.key_list.append(cache_key)
continue if not isinstance(chk['max']['cat_id'], int): chk['max']['cat_id'] = 100000000 if chk['min']['cat_id'] == chk['max']['cat_id']: if chk['min']['cat_id'] in shard_dict['shard']: continue shard_dict['shard'].append(chk['min']['cat_id']) continue min_max_list = [chk['min']['cat_id'], chk['max']['cat_id']] key = chk['shard'] if shard_dict.has_key(key): shard_dict[key].append(min_max_list) else: shard_dict[key] = [min_max_list] conn.disconnect() shard_list = genr_sublist(shard_dict['shard'], 4) del shard_dict['shard'] count = 0 index = 0 for key in shard_dict: conn = pymongo.Connection(shard_conn_ip[key]) kwlib = conn.kwlib kwlib.authenticate('PS_kwlibAdmin', 'PS_managerKwlib') cat_coll = kwlib.kwlib_catinfo cat_list = [] for max_min_list in shard_dict[key]: for cat in cat_coll.find( {'cat_id': { '$gte': max_min_list[0], '$lte': max_min_list[1] }}):
def get_words_gdata(word_list, time_scope=None): """ 获取关键词的详细数据,全网数据接口 传入参数: bidword_list:以逗号分割的字符串列表,如“连衣裙,红色连衣裙...” start_date:起始时间 end_date:终止时间 返回值: impression:展现量 click:点击量 cost:花费,单位 分 directtransaction:直接成交金额 indirecttransaction:间接成交金额 directtransactionshipping:直接成交笔数 indirecttransactionshipping:间接成交笔数 favitemtotal:宝贝收藏数 favshoptotal:店铺收藏数 transactionshippingtotal:总的成交笔数 1 transactiontotal:成交总金额 favtotal:总的收藏数,包括宝贝收藏数以及店铺收藏数 1 competition:竞争度 ctr:点击率 cpc:平均点击花费 roi:投资回报率 1 string coverage:点击转化率 1 string bidword:关键词 函数返回值: { '连衣裙':{'pv':1, 'click':1, 'cpc':1, 'ctr':1, 'competition':1, 'last_update_time':1} ... .. . } """ if not time_scope: # 不给时间区间,就默认采用昨天的数据 yst_date = '%s' % (datetime.date.today() - datetime.timedelta(days=1)) time_scope = (yst_date, yst_date) word_dict = {} for temp_list in genr_sublist(word_list, 100): try: tobj = tsapi.simba_insight_wordsdata_get( bidword_list=','.join(temp_list), start_date=time_scope[0], end_date=time_scope[1]) if hasattr(tobj, "word_data_list"): word_data_list = tobj.word_data_list if hasattr(word_data_list, "insight_word_data_d_t_o"): insight_word_data_d_t_o = word_data_list.insight_word_data_d_t_o for word_data in insight_word_data_d_t_o: cpc = int( float( getattr( word_data, "cpc", word_data.click > 0 and word_data.cost / word_data.click or 0))) ctr = float(getattr(word_data, "ctr", 0)) competition = int( float(getattr(word_data, "competition", 0))) roi = float(getattr(word_data, "roi", 0)) coverage = float(getattr(word_data, "coverage", 0)) favtotal = int(float(getattr(word_data, "favtotal", 0))) transactionshippingtotal = int( float( getattr(word_data, "transactionshippingtotal", 0))) word_dict[word_data.bidword] = DictWrapper({ 'pv': int(word_data.impression), 'click': int(word_data.click), 'avg_price': cpc, 'ctr': ctr, 'competition': competition, 'word': word_data.bidword, 'roi': roi, 'coverage': coverage, 'favtotal': favtotal, 'transactionshippingtotal': transactionshippingtotal }) except TopError, e: if "API 
error response" in str(e): log.info("test error for :" + str(datetime.datetime.now()) + ',\t' + ','.join(temp_list)) log.error('simba_insight_wordsdata_get TopError, e=%s' % (e))
def cleanup_expired(request=None):
    """Purge expired data:
    1. remove all data of shops whose subscription expired 15+ days ago;
    2. for some tables, clean rows by age (usually 30 days).
    """
    def is_outservice(shop_id):
        # True when the shop's latest subscription deadline is past the
        # configured expiry window (default 15 days)
        expire_days = Config.get_value('web.clean_up.OUTSERVICE_EXPIRE_DAYS', default=15)
        deadline_query_sql = "select deadline from router_articleusersubscribe where nick=(select nick from router_user where shop_id=%s) order by deadline desc limit 1" # check whether the shop may be deleted
        deadline_list = execute_query_sql(deadline_query_sql % shop_id)
        for tmp_deadline in deadline_list:
            deadline = tmp_deadline["deadline"]
            if time_is_ndays_interval(deadline, expire_days):
                return True
        return False

    def remove_shopdata(shop_id_list):
        """Remove all data belonging to the given shops."""
        from apps.subway.models import account_coll, camp_coll, adg_coll, crt_coll, ccrt_coll, kw_coll, item_coll, attn_coll, uprcd_coll
        # remove the basic structure data (reports included below)
        # account_coll.remove({'_id':{'$in':shop_id_list}}) # shop records themselves are kept
        camp_coll.remove({'shop_id': {'$in': shop_id_list}})  # campaigns
        adg_coll.remove({'shop_id': {'$in': shop_id_list}})   # adgroups
        crt_coll.remove({'shop_id': {'$in': shop_id_list}})   # creatives
        ccrt_coll.remove({'shop_id': {'$in': shop_id_list}})  # custom creatives
        kw_coll.remove({'shop_id': {'$in': shop_id_list}})    # keywords
        item_coll.remove({'shop_id': {'$in': shop_id_list}})  # items
        # remove the report data
        from apps.subway.models_report import acctrpt_coll, camprpt_coll, adgrpt_coll, crtrpt_coll, kwrpt_coll
        # acctrpt_coll.remove({'shop_id': {'$in': shop_id_list}}) # account reports are kept
        camprpt_coll.remove({'shop_id': {'$in': shop_id_list}})
        adgrpt_coll.remove({'shop_id': {'$in': shop_id_list}})
        crtrpt_coll.remove({'shop_id': {'$in': shop_id_list}})
        kwrpt_coll.remove({'shop_id': {'$in': shop_id_list}})
        # remove algorithm/feature data
        attn_coll.remove({'_id': {'$in': shop_id_list}})  # watch/attention data
        # clear rank-grabbing settings and their history
        from apps.engine.models_channel import MessageChannel
        from apps.engine.models_kwlocker import kw_locker_coll
        kw_cur = kw_locker_coll.find({'shop_id': {
            '$in': shop_id_list
        }}, {'_id': 1})
        kw_id_list = [kw['_id'] for kw in kw_cur]
        MessageChannel.delete_msg_history(kw_id_list)
        kw_locker_coll.remove({'shop_id': {'$in': shop_id_list}})
        from apps.engine.models import shopmng_task_coll
        shopmng_task_coll.remove({'_id': {'$in': shop_id_list}})  # per-shop master tasks
        from apps.mnt.models import mnt_camp_coll, mnt_task_coll
        mnt_camp_coll.remove({'shop_id': {'$in': shop_id_list}})  # fully-automatic campaigns
        mnt_task_coll.remove({'shop_id': {'$in': shop_id_list}})  # fully-automatic tasks
        uprcd_coll.remove({'shop_id': {'$in': shop_id_list}})     # operation records
        from apps.alg.models import optrec_coll
        optrec_coll.remove({'shop_id': {'$in': shop_id_list}})    # auto-optimisation analysis records
        from apps.subway.download import dler_coll
        dler_coll.remove({'_id': {'$in': shop_id_list}})          # download state
        from apps.crm.models import psmsg_coll
        psmsg_coll.remove({'shop_id': {'$in': shop_id_list}})     # customer messages
        try:
            user_list = User.objects.filter(nick__in=shop_id_list)
            user_id_list, uid_list, username_list = [], [], []
            for user in user_list:
                user_id_list.append(user.id)
                uid_list.append(user.first_name)
                username_list.append(user.nick)
            from apps.router.models import Agent, AccessToken, AdditionalPermission, NickPort, Shop
            Agent.objects.filter(principal__in=user_id_list).delete()  # user agents
            AccessToken.objects.filter(uid__in=uid_list, platform='web').delete()  # Qianniu sessions
            AdditionalPermission.objects.filter(
                user__in=user_id_list).delete()  # extra grants
            NickPort.objects.filter(nick__in=username_list).delete()  # assigned server
            Shop.objects.filter(sid__in=shop_id_list).delete()  # shop info
        except Exception:
            # best-effort: relational cleanup failures are deliberately swallowed
            pass

    def remove_outdated():
        # age-based cleanup: everything older than 30 days
        last_date = datetime.date.today() - datetime.timedelta(30)
        default_deadline = date_2datetime(last_date)
        from django.contrib.sessions.models import Session
        Session.objects.filter(expire_date__lte=default_deadline).delete()
        from apps.subway.models_report import AccountRpt, CampaignRpt, AdgroupRpt, CreativeRpt, KeywordRpt
        AccountRpt.clean_outdated()
        CampaignRpt.clean_outdated()
        AdgroupRpt.clean_outdated()
        CreativeRpt.clean_outdated()
        KeywordRpt.clean_outdated()
        from apps.subway.models_upload import UploadRecord
        UploadRecord.clean_outdated()
        from apps.alg.models import OptimizeRecord
        OptimizeRecord.clean_outdated()
        # drop point data frozen for more than 30 days
        from apps.web.models import PointActivity
        PointActivity.clean_outdated()

    # NOTE(review): shopmng_task_coll here relies on a binding outside this
    # function body (the visible import is inside remove_shopdata) -- confirm
    smt_cursor = shopmng_task_coll.find({'status': 0}, {'_id': 1})
    deactived_shop_id_list = [smt['_id'] for smt in smt_cursor]  # shops whose master task cannot run
    expired_shop_id_list = []
    log.info('need to check %s shops' % len(deactived_shop_id_list))
    for temp_list in genr_sublist(deactived_shop_id_list, 100):
        expired_shop_id_list = []
        for shop_id in temp_list:
            if is_outservice(shop_id):
                expired_shop_id_list.append(shop_id)
        if expired_shop_id_list:
            remove_shopdata(expired_shop_id_list)
    log.info('start remove outdated data')
    remove_outdated()
    log.info('all data cleaned OK')
def get_catsworddata( cat_id, word_list, start_date, end_date): # TODO 需要测试传入word_str的最大长度 起始时间和终止时间的时间间距,isp错误 连接超时 """ 获取关键词在类目下的数据 传入参数: cat_id:类目id str bidword_list:以^^分割的字符串,默认最长为200 start_date:起始时间 end_date:终止时间 taobao返回值: impression:展现量 click:点击量 cost:花费,单位 分 directtransaction:直接成交金额 indirecttransaction:间接成交金额 directtransactionshipping:直接成交笔数 indirecttransactionshipping:间接成交笔数 favitemtotal:宝贝收藏数 favshoptotal:店铺收藏数 transactionshippingtotal:总的成交笔数 transactiontotal:总的成交金额 favtotal:总的收藏总数,包括宝贝收藏和店铺收藏 competition:竞争度 ctr:点击率 roi:投入产出比 cpc:点击花费 coverage:点击转化率 cat_id:类目id cat_name:类目名称 bidword:关键词 函数返回值: { ‘连衣裙’:{‘pv’:1,‘click’:1,‘cpc’:1,‘ctr’:1,‘competition’:1,} ... .. . } """ cat_word_data_dict = {} last_update_time = datetime.datetime.now() for temp_list in genr_sublist(word_list, 100): try: tobj = tsapi.simba_insight_catsworddata_get( cat_id=cat_id, bidword_list=','.join(temp_list), start_date=start_date, end_date=end_date) if hasattr(tobj, "catword_data_list"): catword_data_list = tobj.catword_data_list if hasattr(catword_data_list, "insight_word_data_under_cat_d_t_o"): insight_word_data_under_cat_d_t_o = catword_data_list.insight_word_data_under_cat_d_t_o for cat_data in insight_word_data_under_cat_d_t_o: cpc = getattr( cat_data, "cpc", cat_data.click > 0 and (cat_data.cost / cat_data.click) or 0) try: cat_word_data_dict[cat_data.bidword] = { 'pv': cat_data.impression, 'click': cat_data.click, 'cpc': cpc, 'ctr': cat_data.ctr, 'competition': cat_data.competition, 'last_update_time': last_update_time, 'word': cat_data.bidword } except Exception, e: log.error("get the top error = %s" % e) continue except TopError, e: log.error('get simba_insight_catstopwordnew_get error, e=%s' % (e))
def get_word_subdata(word_list, start_date, end_date): """ 获取关键词按流量进行细分的数据,返回结果中network表示流量的来源,意义如下:1->PC站内,2->PC站外,4->无线站内 5->无线站外 出入参数: bidword_list:以逗号分割的关键词列表 start_date:起始时间 end_date:终止时间 返回值: impression: 展现量 click:点击量 cost: 花费,单位(分) directtransaction: 直接成交金额 indirecttransaction:间接成交金额 directtransactionshipping:直接成交笔数 indirecttransactionshipping:间接成交笔数 favitemtotal:宝贝搜藏数 favshoptotal:店铺搜藏数 transactionshippingtotal:总的成交笔数 transactiontotal:成交总金额 favtotal:总的收藏数,包括宝贝收藏和店铺收藏 competition:竞争度 ctr:点击率 cpc:平均点击花费 roi:投入产出比 coverage:点击转化率 bidword:关键词 network:流量来源:1:PC站内,2:PC站外,4:无线站内 5:无线站外 mechanism:投放机制:0:关键词推广 2:定向推广 3:通用定向 函数返回值: { '连衣裙':{ "pv":1, "click":1, "cost":1, "directtransaction":1, "indirecttransaction":1, "directtransactionshipping":1, "indirecttransactionshipping":1, "favitemtotal":1, "favshoptotal":1, "transactionshippingtotal":1, "transactiontotal":1, "favtotal":1, "competition":1, "ctr":1, "cpc":1, "roi":1, "coverage":1, "network":1, "mechanism":1, }, ‘红色连衣裙’:{ ... }, ... .. . } """ word_sub_data_dict = {} for temp_list in genr_sublist(word_list, 100): try: tobj = tsapi.simba_insight_wordssubdata_get( bidword_list=','.join(temp_list), start_date=start_date, end_date=end_date) if hasattr(tobj, "word_subdata_list"): word_subdata_list = tobj.word_subdata_list if hasattr(word_subdata_list, "insight_word_sub_data_d_t_o"): insight_word_sub_data_d_t_o = word_subdata_list.insight_word_sub_data_d_t_o for sub_data in insight_word_sub_data_d_t_o: word = sub_data.bidword temp_dict = sub_data.__dict__ if word in word_sub_data_dict: word_sub_data_dict[word].append(temp_dict) else: word_sub_data_dict[word] = [temp_dict] except TopError, e: log.error('get simba_insight_wordssubdata_get error, e=%s' % (e)) return {}