def __init__(self, target_name, base_url, log_toggle=True, page_param='', cate_param='', cate_list=None, cate_list_path='', page_list=[], cate_delay=0, page_delay=0):
    """Initialize a crawler target.

    Args:
        target_name: Label for this crawl target; also used as the log
            file name (``<target_name>.log``) when logging is enabled.
        base_url: Base URL the crawl is rooted at.
        log_toggle: When True, configure file logging immediately.
        page_param: Query-string parameter name for the page number.
        cate_param: Query-string parameter name for the category.
        cate_list: Optional initial list of categories. Defaults to a
            fresh empty list (None sentinel avoids the shared
            mutable-default pitfall of the previous ``cate_list=[]``).
        cate_list_path: Optional file path; when given, the category
            list is loaded from it and overrides ``cate_list``.
        page_list: Optional initial list of pages (fresh empty list by
            default, same sentinel pattern).
        cate_delay: Accepted for interface compatibility; currently
            unused (the original assignment was commented out).
        page_delay: Accepted for interface compatibility; currently
            unused (the original assignment was commented out).
    """
    self.target_name = target_name
    self.base_url = base_url
    self.log_toggle = log_toggle
    self.cate_param = cate_param
    self.page_param = page_param
    # None sentinels: each instance gets its own list object instead of
    # sharing the function's single default-list object across instances.
    self.cate_list = cate_list if cate_list is not None else []
    self.page_list = page_list if page_list is not None else []
    if cate_list_path:
        # A file path takes precedence over any in-memory category list.
        self.cate_list = crawlib.get_list_from_fp(cate_list_path)
    # logging: one log file per target, info level, timestamped lines.
    if self.log_toggle:
        logging.basicConfig(filename=self.target_name + ".log",
                            level=logging.INFO,
                            format='%(asctime)s %(message)s')
        logging.info("------\tMain Start\t------")
def crawling_cate_list(self, cate_list=None, cate_list_path='', page_list=None, cate_delay=0, page_delay=0):
    """Crawl every category in the category list.

    Resolution order for the category list (mirrors ``__init__``):
    a file path wins, then an explicit ``cate_list`` argument, then the
    list already stored on the instance.

    Args:
        cate_list: Optional category list; replaces ``self.cate_list``
            when non-empty. Defaults to None (sentinel instead of the
            previous mutable ``[]`` default; the falsy check below is
            unchanged, so behavior is identical).
        cate_list_path: Optional file path; when given, categories are
            loaded from it into ``self.cate_list``.
        page_list: Optional page list; replaces ``self.page_list`` when
            non-empty (same None-sentinel change).
        cate_delay: Seconds to sleep between categories.
        page_delay: Seconds to sleep between pages, forwarded to
            ``crawling_page_list``.
    """
    if cate_list_path:
        self.cate_list = crawlib.get_list_from_fp(cate_list_path)
    # Warn (but do not abort) when no categories are available from any
    # source — the loop below simply does nothing in that case.
    if not (self.cate_list or cate_list):
        print("not exist cate_list, input cate_list")
    # Update stored lists only when non-empty arguments were supplied.
    if cate_list:
        self.cate_list = cate_list
    if page_list:
        self.page_list = page_list
    for cate in self.cate_list:
        self.crawling_page_list(cate, self.page_list, page_delay=page_delay)
        time.sleep(cate_delay)