Exemplo n.º 1
0
 def add_store(self,
               name,
               plt_name,
               plt_store_id,
               login_username=None,
               url=None,
               status=1,
               properties=None):
     """
     Create a store record and, optionally, its property rows.

     :param name: store name; the call is rejected when it already exists
     :param plt_name: platform name
     :param plt_store_id: id of the store on the platform
     :param login_username: login user name
     :param url: login URL
     :param status: store status, 1 means valid and 0 means invalid
     :param properties: store properties as a two-dimensional sequence; the
         first four items of every row are handed to the property DAO
     :return: whatever ``get_store`` returns for the new key, or None when
         the store name is already taken
     """
     name_taken = self.check_store_name_exists(name)
     if name_taken:
         Logging.error('add_store:', name, '店铺名已存在!')
         return None

     store_key = StoreDao().insert(name, plt_name, plt_store_id,
                                   login_username, url, status)
     # A falsy ``properties`` (None, empty list) means nothing to insert.
     for prop in (properties if properties else ()):
         StorePropertyDao().insert(store_key, prop[0], prop[1], prop[2],
                                   prop[3])
     return self.get_store(store_key)
Exemplo n.º 2
0
 def check_store_login(self):
     """
     Verify that the browser session of the store is logged in.

     Opens https://sycm.taobao.com, waits one second and inspects the URL the
     browser ended up on. Landing on a URL containing 'login.htm' is treated
     as "not logged in".

     :return: None; on success ``self.login_flag`` is set to True
     :raises Exception: when the store is not logged in (``self.error`` is
         set to ErrorEnum.ERROR_1004 first)
     """
     self.driver.get('https://sycm.taobao.com')
     Time.sleep(1)
     current_url = self.driver.current_url
     if 'login.htm' in current_url:
         Logging.error('store:', self.store.name, 'current_url:',
                       current_url, '店铺未登录,无法继续取数!')
         self.error = ErrorEnum.ERROR_1004
         raise Exception('store:', self.store.name, '店铺未登录,无法继续取数!')
     # Store a real boolean rather than a merely truthy object, so that
     # comparisons such as ``login_flag is True`` behave as expected.
     self.login_flag = True
Exemplo n.º 3
0
 def _get_data_tabs(self, page_data_id):
     """
     Load every data-tab entity that belongs to a page-data block.

     :param page_data_id: id of the page-data block
     :return: list with one ``_get_data_tab`` result per row returned by the
         DAO, or None (after logging an error) when the DAO returns nothing
     """
     rows = DataTabDao().query_by_page_data_id(page_data_id)
     if not rows:
         Logging.error('不存在该page_data_id:', page_data_id)
         return None
     # The first item of every row is used as the tab id.
     return [self._get_data_tab(row[0]) for row in rows]
Exemplo n.º 4
0
 def _get_data_tab(self, tab_id):
     """
     Build the entity for a single data tab, including its columns.

     :param tab_id: id of the data tab
     :return: a DataTabEntity built from the first eight fields of the first
         row plus the tab's column entities, or None (after logging an
         error) when the DAO returns nothing
     """
     rows = DataTabDao().query(tab_id)
     if not rows:
         Logging.error('不存在该tab_id:', tab_id)
         return None

     record = rows[0]
     columns = self._get_data_tab_columns(tab_id)
     entity_args = [record[i] for i in range(8)]
     entity_args.append(columns)
     return DataTabEntity(*entity_args)
Exemplo n.º 5
0
 def _get_data_tab_columns(self, tab_id):
     """
     Load the column entities of a data tab.

     :param tab_id: id of the data tab
     :return: list of DataTabColumnEntity, each built from the first twelve
         fields of a row, or None (after logging an error) when the DAO
         returns nothing
     """
     rows = DataTabColumnDao().query_by_tab_id(tab_id)
     if not rows:
         Logging.error('不存在该data_tab_id:', tab_id)
         return None
     return [
         DataTabColumnEntity(*[row[i] for i in range(12)]) for row in rows
     ]
Exemplo n.º 6
0
 def get_page(self, page_id):
     """
     Fetch the description of a page that should be crawled.

     :param page_id: page id
     :return: a PageEntity built from the first nine fields of the first
         row, or None (after logging an error) when the DAO returns nothing
     """
     rows = PageDao().query(page_id)
     if not rows:
         Logging.error('page_id:', page_id, ' 不存在!')
         return None

     record = rows[0]
     return PageEntity(*[record[i] for i in range(9)])
Exemplo n.º 7
0
 def _locate_page(self):
     """
     Navigate the browser to the page that data is taken from.

     The URL comes from ``self.page.url`` and is requested twice: the first
     request lands on the platform's default page, the second one reaches
     the page that should be crawled.

     :return: True when both requests succeed; False when either raises, in
         which case ``self.error`` is set to ErrorEnum.ERROR_3001
     """
     try:
         self.driver.get(self.page.url)  # first request: platform default page
         # No ``close`` between the two requests: Selenium's ``close()``
         # takes no argument and would close the window that the second
         # request needs.
         self.driver.get(self.page.url)  # second request: the target page
     except Exception as e:
         Logging.error(e)
         self.error = ErrorEnum.ERROR_3001
         return False
     return True
Exemplo n.º 8
0
 def __init__(self, store_id, page_data_id, port):
     """
     Prepare everything a crawl task needs.

     1. Instantiate the objects: Store, PageData (and its page)
     2. Prepare the environment (download / process / backup directories)
     3. Confirm the web driver connection
     4. Confirm the store login; ``login_flag`` becomes True once the browser
        works and the store is logged in

     Failures do not propagate: they are logged and recorded in
     ``self.error``. A specific code already set by a helper (for example by
     ``init_web_driver`` or ``check_store_login``) is kept; ERROR_1000 is
     only used when no more specific code exists.

     :param store_id: store id, used to load the store object
     :param page_data_id: id of the page-data block to crawl
     :param port: port of the already running browser service
     """
     self.error = None
     self.login_flag = False
     # Plain attributes are set before anything can fail, so the instance is
     # never left half-initialised when a later step raises.
     self.port = port
     self.driver = None
     # Data dimension dictionary
     self.data_dimension_dict = {}
     # Needed when data is taken from downloaded files
     self.file_names = []
     # Single file, single table: [DataFrame]
     # Multiple files/sheets, single table: [DataFrame, DataFrame, DataFrame]
     # Multiple files/sheets, multiple tables (page_data.is_multiple_tab()):
     # [{'tab.name', [DataFrame]}, {'tab.name', [DataFrame, DataFrame]}]
     self.source_data_list = []
     self.data_list = []
     try:
         self.store = StoreService().get_store(store_id)
         self.page_data = PageDataService().get_page_data(page_data_id)
         self.page = self.page_data.page
         self.db = DataBase()
         self.FILE_PART_PATH = '/'.join([
             self.store.name, self.page_data.name,
             self.page_data.data_update_freq
         ])
         self.FILE_DOWNLOAD_PATH = setting.FILE_DOWNLOAD_PATH_PREFIX + '/' + self.store.name
         self.FILE_PROCESS_PATH = setting.FILE_PROCESS_PATH_PREFIX + '/' + self.FILE_PART_PATH
         self.FILE_BACKUP_PATH = setting.FILE_BACKUP_PATH_PREFIX + '/' + self.FILE_PART_PATH
         for path in (self.FILE_DOWNLOAD_PATH, self.FILE_PROCESS_PATH,
                      self.FILE_BACKUP_PATH):
             if not os.path.exists(path):
                 os.makedirs(path)
         # Clean the download directory
         self.clear_download_path()
         # Attach to the web driver and check whether the store is logged in
         self.init_web_driver()
         self.check_store_login()
     except Exception as e:
         Logging.error(e)
         if self.error is None:
             self.error = ErrorEnum.ERROR_1000
Exemplo n.º 9
0
 def get_store(self, store_id):
     """
     Fetch a single store object.

     :param store_id: store id
     :return: a StoreEntity built from the first nine fields of the first
         row plus the store's property entities, or None (after logging an
         error) when the DAO returns nothing
     """
     rows = StoreDao().query(store_id)
     if not rows:
         Logging.error('店铺id不存在:', store_id)
         return None

     record = rows[0]
     store_properties = self._get_store_properties(store_id)
     entity_args = [record[i] for i in range(9)]
     entity_args.append(store_properties)
     return StoreEntity(*entity_args)
Exemplo n.º 10
0
 def _get_store_properties(self, store_id):
     """
     Fetch the property list of a store.

     :param store_id: store id
     :return: list of StorePropertyEntity, each built from the first eight
         fields of a row, or None (after logging an error) when the DAO
         returns nothing
     """
     rows = StorePropertyDao().query_by_store_id(store_id)
     if not rows:
         Logging.error('该店铺id不存在:', store_id)
         return None
     return [
         StorePropertyEntity(*[row[i] for i in range(8)]) for row in rows
     ]
Exemplo n.º 11
0
 def operation_data_input(self):
     """
     Write the processed data frame to the database, column by column name.

     The first frame of ``self.data_list`` is inserted into the table named
     by ``self.page_data.data_tabs[0].name``; the frame's column names are
     used as the target column names.

     Any failure is logged and recorded as ErrorEnum.ERROR_5002 in
     ``self.error``; nothing is raised.
     """
     try:
         df = self.data_list[0]
         columns = df.columns.tolist()
         if not columns:
             raise ValueError('data frame has no columns to insert')
         # Every data frame row becomes one tuple of values.
         rows = list(df.itertuples(index=False, name=None))
         # Build the column list explicitly. Formatting a Python tuple into
         # the statement would emit single-quoted names (string literals in
         # SQL) and a trailing comma for a one-column frame.
         # NOTE(review): backtick quoting assumes a MySQL-compatible
         # dialect - confirm against the DataBase implementation.
         column_sql = ', '.join(
             '`{}`'.format(str(column).replace('`', '``'))
             for column in columns)
         placeholders = ', '.join(['%s'] * len(columns))
         insert_sql = "insert into {} ({}) values ({})".format(
             self.page_data.data_tabs[0].name, column_sql, placeholders)
         self.db.insert_many(insert_sql, rows)
         self.db.commit()
     except Exception as e:
         Logging.error(e)
         self.error = ErrorEnum.ERROR_5002
Exemplo n.º 12
0
 def __init__(self, name, param=None):
     """
     Instantiate the object/task identified by ``name``.

     Failures do not propagate: they are logged and recorded in
     ``self.error``.

     :param name: object identifier, the dotted path from the package down
         to the final object, e.g. handle.xxx.Obj
     :param param: instantiation parameters (dict); the page-crawling
         entries read the keys 'store_id', 'page_data_id' and 'port'.
         Defaults to a fresh empty dict.
     """
     self.error = None
     self.obj = None
     self.obj_name = name
     # A None default avoids sharing one mutable dict between all instances
     # created without an explicit ``param``.
     self.obj_param = {} if param is None else param
     try:
         Logging.info(self.obj_name, self.obj_param, ' 实例化 start!')
         if self.obj_name == 'handle.task_creator.TaskCreator':
             self.obj = TaskCreator()
         elif self.obj_name == 'handle.login.tb_login.TaoLogin':
             try:
                 self.obj = tb_login()
             except Exception as e:
                 Logging.error(e)
                 self.error = ErrorEnum.ERROR_2000
         # ==================== page-crawling instances START ====================
         elif self.obj_name == 'handle.website.subway.report.SubReportDay':
             self.obj = SpreadReportDay(self.obj_param['store_id'],
                                        self.obj_param['page_data_id'],
                                        self.obj_param['port'])
         elif self.obj_name == 'handle.website.subway.direct_report.SpreadReportDay':
             self.obj = SpreadReportDay1(self.obj_param['store_id'],
                                         self.obj_param['page_data_id'],
                                         self.obj_param['port'])
         # ==================== page-crawling instances END ====================
         else:
             self.error = ErrorEnum.ERROR_9001
             # ``obj_param`` is usually a dict, so it has to be converted
             # before it can be concatenated into the message.
             self.error.value.set_msg(('未匹配到任务实例 name:' + self.obj_name +
                                       ',param:' + str(self.obj_param)))
         if self.is_success():
             Logging.info(self.obj_name, self.obj_param, ' 实例化成功 end!')
         else:
             Logging.info(self.obj_name, self.obj_param, ' 实例化失败 error:',
                          self.error, ' end!')
     except Exception as e:
         Logging.error(e)
         # Prefer the error reported by the created object; fall back to the
         # generic code only when nothing more specific is known.
         if self.is_success() and self.obj and self.obj.error:
             self.error = self.obj.error
         elif self.is_success():
             self.error = ErrorEnum.ERROR_9999
Exemplo n.º 13
0
    def operation_data_process(self):
        """
        Parse and prepare the downloaded data for insertion.

        Takes the first frame of ``self.source_data_list``, adds the columns
        that are configured without a check column name, fills the fixed
        bookkeeping columns and appends the result to ``self.data_list``.

        :return: True on success; False when anything raises, in which case
            ``self.error`` is set to ErrorEnum.ERROR_5001
        """
        try:
            check_field_names = []  # column names that must match the file
            default_add_field = []  # column names that are added by default
            columns = self.page_data.data_tabs[0].data_tab_columns
            for data_tab_column in columns:
                if data_tab_column.check_col_name is not None:
                    check_field_names.append(data_tab_column.check_col_name)
                else:
                    default_add_field.append(data_tab_column.col_name)

            df = self.source_data_list[0]  # the data frame that was read

            # Add the default columns. The list is the local one built
            # above; the instance has no attribute of that name.
            df = pd.concat(
                [df, pd.DataFrame(columns=default_add_field)], sort=False)
            df['店铺id'] = self.store.id
            df['店铺名'] = self.store.name
            df['日期'] = df['_日期']
            df['文件路径'] = self.FILE_BACKUP_PATH
            df['文件sheet'] = 'sheet'
            df['转化周期'] = '15天累计数据'
            df['报表类型'] = '宝贝'
            df['入库时间'] = get_current_timestamp()
            df['取数时间'] = get_current_timestamp()
            file_col_names = df.columns.tolist()
            # Difference between the columns of the file data and the
            # configured check columns.
            # TODO: extra or missing columns still have to be reported in
            # the alert information; the two lists are not consumed yet.
            increase_field = list(set(file_col_names) - set(check_field_names))
            reduce_field = list(set(check_field_names) - set(file_col_names))
            self.data_list.append(df)
        except Exception as e:
            Logging.error(e)
            self.error = ErrorEnum.ERROR_5001
            return False
        return True
Exemplo n.º 14
0
    def init_web_driver(self):
        """
        Attach a Chrome driver to the browser listening on ``self.port``.

        The browser is addressed through the "debuggerAddress" option at
        127.0.0.1:<port>; the resulting driver is stored in ``self.driver``.

        :return: True when the driver is attached
        :raises Exception: the original error is re-raised when attaching
            fails, after ``self.error`` is set to ErrorEnum.ERROR_1003
        """
        try:
            chrome_options = Options()
            chrome_options.add_experimental_option(
                "debuggerAddress", "127.0.0.1:{}".format(self.port))
            self.driver = webdriver.Chrome(chrome_options=chrome_options)

            Logging.info('{} - Chrome[{}]连接成功。'.format(self.store.name,
                                                       self.port))
        except Exception as e:
            # Use the logger like the rest of the code, and re-raise the
            # original exception so its message and traceback stay available.
            Logging.error(e)
            Logging.error('port:{} 无法接管浏览器'.format(self.port))
            self.error = ErrorEnum.ERROR_1003
            raise
        return True
Exemplo n.º 15
0
 def get_page_data(self, page_data_id):
     """
     Fetch the description of a page-data block.

     :param page_data_id: id of the page-data block
     :return: a PageDataEntity built from the first eleven fields of the
         first row plus its configurations, page and data tabs, or None
         (after logging an error) when the DAO returns nothing
     """
     rows = PageDataDao().query(page_data_id)
     if not rows:
         Logging.error('不存在该page_data_id:', page_data_id)
         return None

     record = rows[0]
     confs = self._get_page_data_confs(page_data_id)
     # The second field of the row is used as the page id.
     page = PageService().get_page(record[1])
     tabs = self._get_data_tabs(page_data_id)
     entity_args = [record[i] for i in range(11)]
     entity_args.extend([confs, page, tabs])
     return PageDataEntity(*entity_args)
Exemplo n.º 16
0
 def operation_page(self):
     """
     Apply the report filters, then locate and download the report file.

     Step 1 operates the name and time controls; a failure is recorded as
     ErrorEnum.ERROR_3002 and ends the method, because there is nothing
     meaningful to download without the filters.
     Step 2 walks the pages of the download list looking for the row whose
     text contains the expected file name, clicks its download link and
     waits for the download to finish; a failure is recorded as
     ErrorEnum.ERROR_3003.

     Nothing is raised; the outcome is reported through ``self.error``.
     """
     try:
         start_date, end_date = get_day_report_rule1()
         # Operate the individual filter controls
         self._operator_name_control()
         self._operator_time_control(start_date, end_date)
     except Exception as e:
         Logging.error(e)
         self.error = ErrorEnum.ERROR_3002
         # Do not go on: a later failure would overwrite this error code.
         return
     try:
         download_url = 'https://subway.simba.taobao.com/#!/report/bpreport/download'
         self.web_driver.get(download_url)
         # Total number of pages; the element text is a string and has to be
         # converted before it can drive ``range``.
         page_num = int(
             self.web_driver.find_element_in_xpath(
                 '//*[@id="brix_brick_291"]/div[2]/div[2]/span[2]').text)
         file_name = 'RPA' + date_to_string(get_current_timestamp(),
                                            '%Y%m%d%H%M%S')
         # XPath of the table cell that shows the file name ("contains" is
         # the XPath function name; "contain" does not exist).
         cell_xpath = (
             '//*[@id="brix_brick_334"]/tbody//td[contains(text(), "{}")]'
             .format(file_name))
         link_xpath = cell_xpath + '/../td/a[contains(@class, "mr10")]'
         for x in range(page_num):
             self.web_driver.get(download_url + '?page={}'.format(x))
             if self.web_driver.find_element_in_xpath(cell_xpath):
                 self.web_driver.find_element_in_xpath(link_xpath).click()
                 time.sleep(5)
                 break
         self.wait_download_finish()
     except Exception as e:
         Logging.error(e)
         self.error = ErrorEnum.ERROR_3003
0
def worker_task_run():
    """
    Worker loop: claim jobs one after another and crawl their page data.

    For every job returned by the task creator the worker
    1. marks the job as started (another worker may have been faster, in
       which case the next job is fetched),
    2. creates one crawl task per page-data id, logging the store in first
       when the task reports that it is not logged in,
    3. runs the crawl steps in order: page operation, download/read, data
       processing, database input, backup,
    4. reports the job result exactly once: 'success' only when every page
       went through all steps, 'fail' otherwise.

    A failing page does not stop the remaining pages of the same job; a
    failing login or task initialisation aborts the job.
    """
    page_task_name = 'handle.website.subway.report.SubReportDay'
    # (step name passed to TaskController.run, message used on failure)
    steps = (
        ('operation_page', '取数-页面操作失败!'),
        ('operation_page_download', '取数-页面文件下载及读取失败!'),
        ('operation_data_process', '取数-数据处理失败!'),
        ('operation_data_input', '取数-数据入库失败!'),
        ('operation_data_backup', '取数-数据备份失败!'),
    )
    tc = TaskController('handle.task_creator.TaskCreator')
    job_id, store_id, page_data_ids = tc.run('get_task')
    while job_id:
        flag = tc.run('task_set_start', {'job_id': job_id})
        if not flag:
            # The job could not be claimed; go on with the next one.
            Logging.info('job:', job_id, store_id, page_data_ids,
                         ' 任务领取慢了一拍,继续获取其他任务!')
            job_id, store_id, page_data_ids = tc.run('get_task')
            continue
        try:
            port = None
            all_pages_ok = True
            for page_data_id in page_data_ids:
                # step1: initialise the crawl task
                param = {
                    'store_id': store_id,
                    'page_data_id': page_data_id,
                    'port': port,
                    'job_id': job_id
                }
                task = TaskController(page_task_name, param)
                # ``task.obj`` stays None when the instantiation failed; in
                # that case the is_success check below reports the failure.
                if task.obj is not None and not task.obj.login_flag:
                    # step2: the store is not logged in, log in first. The
                    # store lives on the wrapped crawler object, not on the
                    # controller.
                    login_tc = TaskController('handle.login.tb_login.TaoLogin',
                                              task.obj.store)
                    login_tc.run('run')
                    if login_tc.is_success():
                        # NOTE(review): TaskController.__init__ sets no
                        # ``port`` attribute - confirm where the port of the
                        # logged-in browser is actually exposed.
                        port = login_tc.port
                        param['port'] = port
                        task = TaskController(page_task_name, param)
                    else:
                        Logging.error('param:', param, '登录失败!')
                        raise Exception('param:', param, '登录失败!')
                if not task.is_success():
                    Logging.error('param:', param, '任务初始化失败!')
                    raise Exception('param:', param, '任务初始化失败!')
                try:
                    # step3 - step7: run the crawl steps in order and stop
                    # at the first one that fails.
                    for func, failure_msg in steps:
                        task.run(func)
                        if not task.is_success():
                            Logging.error('param:', param, failure_msg)
                            raise Exception('param:', param, failure_msg)
                except Exception as e:
                    Logging.error(e)
                    Logging.error('param:', param, ' 页面取数过程失败!')
                    # Remember the failure so that the job is not reported
                    # as successful, but keep going with the other pages.
                    all_pages_ok = False
            tc.run('task_set_end', {
                'job_id': job_id,
                'result': 'success' if all_pages_ok else 'fail'
            })
        except Exception as e:
            Logging.error(e)
            Logging.error('job_id:', job_id, ' 任务执行失败!')
            tc.run('task_set_end', {'job_id': job_id, 'result': 'fail'})
        # Fetch the next job
        job_id, store_id, page_data_ids = tc.run('get_task')
Exemplo n.º 18
0
 def wait_download_finish(self, file_type=None):
     """
     Wait for the download to finish, then move and read the file.

     The download directory is polled about once per second. Files whose
     name contains '.crdownload' or '.tmp' are treated as still downloading
     and ignored. A regular file matches when no prefix rule is configured
     (``page_data.rule_read_file_prefix`` is None) or when its name starts
     with that prefix. Exactly one matching file is moved to the process
     directory (an existing file of the same name is replaced), read into a
     DataFrame and appended to ``self.source_data_list``.

     :param file_type: 'excel' or 'csv'; when None the type is derived from
         the file name ending ('csv', 'xls' or 'xlsx')
     :return: True once the file has been read; False when no matching file
         appeared before the timeout
     :raises Exception: when more than one file matches, or when the file
         type cannot be determined
     """
     # Download timeout: 3 minutes
     timeout_num = 180
     prefix = self.page_data.rule_read_file_prefix
     while timeout_num >= 0:
         # Collect the matching file names themselves (not only a counter),
         # so that the file processed below is the one that matched and not
         # simply the last directory entry.
         matched_files = []
         for file in os.listdir(self.FILE_DOWNLOAD_PATH):
             # Suffixes of files that are still being downloaded
             if '.crdownload' in file or '.tmp' in file:
                 continue
             if not os.path.isfile(
                     os.path.join(self.FILE_DOWNLOAD_PATH, file)):
                 continue
             if prefix is None or file.startswith(prefix):
                 matched_files.append(file)
         if not matched_files:
             Time.sleep(1)
             timeout_num = timeout_num - 1
             continue
         if len(matched_files) > 1:
             raise Exception('文件下载失败')

         file = matched_files[0]
         file_path = os.path.join(self.FILE_DOWNLOAD_PATH, file)
         self.file_names.append(file)
         # Move the file to the process directory
         if self.page_data.rule_save_path_suffix is None:
             file_process_path = self.FILE_PROCESS_PATH
         else:
             path_suffix = self.page_data.rule_save_path_suffix
             for key, value in self.data_dimension_dict.items():
                 path_suffix = path_suffix.replace(key, value)
             file_process_path = self.FILE_PROCESS_PATH + '/' + path_suffix
             if not os.path.exists(file_process_path):
                 os.makedirs(file_process_path)
         remote_path = os.path.join(file_process_path, file)
         # TODO: rename instead of replacing an existing target file,
         # e.g. <timestamp>.<original file name>
         if os.path.exists(remote_path):
             os.remove(remote_path)
         shutil.move(file_path, remote_path)
         Logging.info("move %s -> %s" % (file_path, remote_path))
         # Read the file
         # TODO: unpacking archives, multiple files, multiple sheets
         # TODO: configurable file types, support for the common ones
         if file_type is None:
             if file.endswith('csv'):
                 file_type = 'csv'
             elif file.endswith(('xls', 'xlsx')):
                 file_type = 'excel'
         if file_type == 'excel':
             df = pd.read_excel(remote_path)
         elif file_type == 'csv':
             df = pd.read_csv(remote_path)
         else:
             Logging.error('解析文件类型,未找到!')
             raise Exception('解析文件类型,未找到!')
         self.source_data_list.append(df)
         return True
     return False
Exemplo n.º 19
0
 def run(self, func, param=None):
     """
     Dispatch one step (``func``) to the wrapped object.

     Supported combinations:
     - TaskCreator: 'task_init', 'task_added', 'get_task', 'task_finish'
       (called without arguments), 'task_set_start', 'task_set_end'
       (called with ``param``)
     - TaoLogin: 'run' (called with ``param``)
     - any object whose name starts with 'handle.website':
       'operation_page', 'operation_data_process', 'operation_data_input',
       'operation_data_backup' (called without arguments; an exception is
       logged and turned into the step's error code instead of propagating)
     Anything else is recorded as ErrorEnum.ERROR_9002.

     NOTE(review): 'operation_page_download', which worker_task_run
     requests, has no entry here and therefore ends in ERROR_9002 - confirm
     whether a branch is missing.

     :param func: name of the step to execute
     :param param: parameters of the step (dict); defaults to a fresh
         empty dict
     :return: whatever the called step returns, None when nothing was called
     :raises Exception: an error outside the guarded website steps is
         logged, recorded in ``self.error`` and re-raised
     """
     # A None default avoids sharing one mutable dict between calls.
     if param is None:
         param = {}
     results = None
     try:
         Logging.info(self.obj_name, func, param, ' 步骤执行 start!')
         is_creator = self.obj_name == 'handle.task_creator.TaskCreator'
         is_login = self.obj_name == 'handle.login.tb_login.TaoLogin'
         is_website = self.obj_name.find('handle.website') == 0
         # Website step -> error code recorded when the step raises
         website_errors = {
             'operation_page': ErrorEnum.ERROR_3000,
             'operation_data_process': ErrorEnum.ERROR_4000,
             'operation_data_input': ErrorEnum.ERROR_5000,
             'operation_data_backup': ErrorEnum.ERROR_6000,
         }
         if is_creator and func in ('task_init', 'task_added', 'get_task',
                                    'task_finish'):
             results = getattr(self.obj, func)()
         elif is_creator and func in ('task_set_start', 'task_set_end'):
             results = getattr(self.obj, func)(param)
         elif is_login and func == 'run':
             results = self.obj.run(param)
         elif is_website and func in website_errors:
             try:
                 results = getattr(self.obj, func)()
             except Exception as e:
                 Logging.error(e)
                 self.error = website_errors[func]
         else:
             self.error = ErrorEnum.ERROR_9002
             self.error.value.set_msg(
                 ('未匹配到任务func name:' + self.obj_name + ',func:' + func))
         if self.is_success():
             Logging.info(self.obj_name, func, param, ' 步骤执行成功 end!')
         else:
             Logging.info(self.obj_name, func, param, ' 步骤执行失败 error:',
                          self.error, ' end!')
     except Exception as e:
         Logging.error(e)
         # Prefer the error reported by the wrapped object; fall back to the
         # generic code only when nothing more specific is known.
         if self.is_success() and self.obj and self.obj.error:
             self.error = self.obj.error
         elif self.is_success():
             self.error = ErrorEnum.ERROR_9999
         # Re-raise the original exception so that callers keep its message
         # and traceback.
         raise
     return results