Exemplo n.º 1
0
    def check_status(self):
        """Block until every task in ``self.key_list`` has reported success.

        Each task's flag is read from ``self.cache_db`` under the key
        ``"<key>_status"``. The cache is polled every 0.1 seconds until a flag
        is present for every task and all flags are truthy. The flags are then
        deleted so they cannot influence the next run.

        Returns:
            True once all tasks have reported a truthy status.

        Raises:
            WorkerFailed: if any reported status is falsy.
            WorkerTimeout: if the tasks have not all reported within
                ``len(self.adgroup_id_list) * 60`` seconds.
        """
        # NOTE(review): the original author asked whether polling this often
        # could push API access over a rate limit -- still unanswered.
        status_keys = ["%s_status" % key for key in self.key_list]
        task_count = len(self.key_list)

        def is_done(status_dict):
            # Nothing reported yet: keep waiting.
            if not status_dict:
                return False
            # A single falsy flag fails the whole batch immediately.
            if not all(status_dict.values()):
                raise WorkerFailed("download-failed")
            return len(status_dict) == task_count

        status_dict = CacheAdpter.get_many(status_keys, self.cache_db)
        has_done = is_done(status_dict)

        time_interval = 0.1
        # The timeout scales with the number of adgroups being downloaded.
        # TODO: also cap it with a maximum, and consider global flow control
        # (e.g. today's keyword report download already being complete).
        timeout = len(self.adgroup_id_list) * 60
        st_time = time.time()
        while not has_done:
            time.sleep(time_interval)
            if time.time() - st_time >= timeout:
                # Record which tasks never reported. The cache is keyed by the
                # suffixed status key, so membership has to be tested with the
                # suffix; comparing the bare task keys would list every task.
                reported = status_dict or {}
                undone = [key for key in self.key_list
                          if ("%s_status" % key) not in reported]
                log.error("timeout: total %s, finished %s, undone task=%s" % (task_count, len(reported), undone))
                raise WorkerTimeout("download-timeout")
            status_dict = CacheAdpter.get_many(status_keys, self.cache_db)
            has_done = is_done(status_dict)

        # Clear the flags once this run's outcome is known so that they do not
        # affect the next check.
        CacheAdpter.delete_many(status_keys, self.cache_db)
        return True
Exemplo n.º 2
0
 def sum_result(self):
     """Merge the cached partial reports of all tasks into one report list.

     Reads the values stored under ``self.key_list`` in ``self.cache_db``,
     concatenates them and passes the combined list to
     ``KeywordRpt.simply_rpt``.

     Returns:
         The result of ``KeywordRpt.simply_rpt`` on the concatenated
         reports, or an empty list when the cache returned nothing.
     """
     result_dict = CacheAdpter.get_many(self.key_list, self.cache_db)
     if not result_dict:
         return []

     # Extend one accumulator instead of reduce(list.__add__, ...): the
     # latter re-copies the growing list for every partial result and
     # relies on the builtin ``reduce``, which Python 3 does not provide.
     rpt_list = []
     for partial_rpt in result_dict.values():
         rpt_list.extend(partial_rpt)
     return KeywordRpt.simply_rpt(rpt_list)
Exemplo n.º 3
0
 def check_status(self):
     """Wait until at least 85% of the sub-projects have reported a status.

     Status flags are read from ``self.db_name`` under the key
     ``"<data_key>_status"`` of each entry in ``self.sub_prj_list``. The
     cache is polled every 0.3 seconds for at most 20 seconds; on timeout
     a message is logged and the wait simply ends.

     Returns:
         Always True, whether the threshold was reached or the wait timed
         out.
     """
     task_total = len(self.sub_prj_list)
     if not task_total:
         # Nothing to wait for; this also avoids dividing by zero below.
         return True

     key_list = [
         '%s_status' % sub_prj['data_key'] for sub_prj in self.sub_prj_list
     ]
     time_interval = 0.3
     time_out = 20
     value_dict = CacheAdpter.get_many(key_list, self.db_name)
     start_time = time.time()
     while float(len(value_dict)) / task_total < 0.85:
         time.sleep(time_interval)
         if time.time() - start_time >= time_out:
             log.info('waiting for worker finishing time out!')
             break
         value_dict = CacheAdpter.get_many(key_list, self.db_name)
     return True
Exemplo n.º 4
0
    def sum_prj_result(self, sub_prj_list, db_name):
        """Merge the workers' cached keyword candidates and select from them.

        Args:
            sub_prj_list: sub-project dicts; each ``'data_key'`` names the
                cache entry holding that worker's result.
            db_name: cache database passed to ``CacheAdpter.get_many``.

        Returns:
            A list of keyword rows. For each entry of
            ``self.select_conf.select_conf_list`` the matching candidates
            are sorted, the configured range is cut out, and the 1-based
            index of that entry is appended to every row as a string.
        """
        key_list = [prj['data_key'] for prj in sub_prj_list]
        log.info('sum project result item_id=%s' % (self.item_id))
        worker_result_dict = CacheAdpter.get_many(key_list, db_name)

        # Merge the per-worker results, grouped by candidate filter name.
        candi_kw_dict = {}
        for worker_result in worker_result_dict.values():
            for candi_filter, kw_rows in worker_result.items():
                candi_kw_dict.setdefault(candi_filter, []).extend(kw_rows)

        # Sort and select according to each configured filter.
        result_list = []
        for filter_index, conf in enumerate(
                self.select_conf.select_conf_list, 1):
            # Hard cap: at most 10000 candidates per category.
            kw_list = candi_kw_dict.get(conf.candi_filter, [])[0:10000]
            if not kw_list:
                continue
            # NOTE(review): eval() on a string built from configuration. This
            # is only safe while ``sort_mode`` is trusted; the comparator is
            # looked up by name as ``sort_kwlist_by_<sort_mode>``.
            sort_func = 'kw_list.sort(sort_kwlist_by_%s)' % conf.sort_mode
            eval(sort_func)

            # ``select_num`` is "<start>-<end>". A start below 1.0 means both
            # bounds are fractions of the list length; otherwise they are
            # 1-based positions.
            range_list = conf.select_num.split('-')
            if float(range_list[0]) < 1.0:
                start_index = int(len(kw_list) * float(range_list[0]))
                end_index = int(len(kw_list) * float(range_list[1]))
            else:
                start_index = int(range_list[0]) - 1
                end_index = int(range_list[1])

            # Slice up to end_index itself. The previous upper bound,
            # ``end_index - start_index``, is a length rather than an index
            # and truncated or emptied every range with a non-zero start.
            tag = str(filter_index)
            result_list.extend(
                kw + [tag] for kw in kw_list[start_index:end_index])

        # TODO(wuhuaqiao): de-duplication (remove_same_words) is disabled
        # because it re-sorts the list and changes the selection order.
        log.info('select keyword from kwlib,result=%s' % len(result_list))
        return result_list
Exemplo n.º 5
0
    def get_prj_statu(self, sub_prj_list, db_name):
        """Refresh the ``'statu'`` of every unfinished sub-project from cache.

        Sub-projects whose ``'statu'`` is already ``'finished'`` are left
        alone. For the others, the value stored under
        ``"<data_key>_statu"`` in ``db_name`` replaces ``prj['statu']`` in
        place whenever that value is truthy.

        Args:
            sub_prj_list: sub-project dicts with ``'statu'``, ``'data_key'``,
                ``'host'`` and ``'port'`` entries.
            db_name: cache database passed to ``CacheAdpter.get_many``.

        Returns:
            None; the dicts in ``sub_prj_list`` are updated in place.
        """
        pending = [prj for prj in sub_prj_list
                   if not prj['statu'] == 'finished']

        status_keys = []
        busy_servers = {}
        for prj in pending:
            status_keys.append(prj['data_key'] + '_statu')
            busy_servers[prj['host'] + ':' + str(prj['port'])] = 1

        log.info('server is working, unfinished server is: %s' %
                 (','.join(busy_servers.keys())))

        value_dict = CacheAdpter.get_many(status_keys, db_name)
        if not value_dict:
            return

        # Only overwrite a status when the cache actually holds a value.
        for prj, status_key in zip(pending, status_keys):
            fresh = value_dict.get(status_key, None)
            if fresh:
                prj['statu'] = fresh
Exemplo n.º 6
0
    def sum_result(self):
        """Merge the workers' cached keyword candidates and select from them.

        Worker results are read from ``self.db_name`` under the
        ``'data_key'`` of each entry in ``self.sub_prj_list``.

        Returns:
            A list of keyword rows. For each entry of
            ``self.select_conf.select_conf_list`` the matching candidates
            are sorted, the configured range is cut out, and the 1-based
            index of that entry is appended to every row as a string.
        """
        key_list = [prj['data_key'] for prj in self.sub_prj_list]
        worker_result_dict = CacheAdpter.get_many(key_list, self.db_name)

        # Merge the per-worker results, grouped by candidate filter name.
        candi_kw_dict = {}
        for worker_result in worker_result_dict.values():
            for candi_filter, kw_rows in worker_result.items():
                candi_kw_dict.setdefault(candi_filter, []).extend(kw_rows)

        # Sort and select according to each configured filter.
        result_list = []
        for filter_index, conf in enumerate(
                self.select_conf.select_conf_list, 1):
            # Hard cap: at most 10000 candidates per category.
            kw_list = candi_kw_dict.get(conf.candi_filter, [])[0:10000]
            if not kw_list:
                continue
            # NOTE(review): eval() on a string built from configuration. This
            # is only safe while ``sort_mode`` is trusted; the comparator is
            # looked up by name as ``sort_kwlist_by_<sort_mode>``.
            sort_func = 'kw_list.sort(sort_kwlist_by_%s)' % conf.sort_mode
            eval(sort_func)

            # ``select_num`` is "<start>-<end>". A start below 1.0 means both
            # bounds are fractions of the list length; otherwise they are
            # 1-based positions.
            range_list = conf.select_num.split('-')
            if float(range_list[0]) < 1.0:
                start_index = int(len(kw_list) * float(range_list[0]))
                end_index = int(len(kw_list) * float(range_list[1]))
            else:
                start_index = int(range_list[0]) - 1
                end_index = int(range_list[1])

            # Slice up to end_index itself. The previous upper bound,
            # ``end_index - start_index``, is a length rather than an index
            # and truncated or emptied every range with a non-zero start.
            tag = str(filter_index)
            result_list.extend(
                kw + [tag] for kw in kw_list[start_index:end_index])

        return result_list