def process(self, tree_path, page_file):
    """
    Process a page by handing it to C{_handle_error_page}.

    The directory derived from the whole C{tree_path} (placed under the
    destination directory of this object) is passed to
    C{lenient_makedir}, and the page is then given to
    C{_handle_error_page} together with the path of C{error.txt} inside
    that directory.

    @param tree_path: non-empty sequence of path elements; converted to
        a relative directory path by C{_convert_tree_path_to_dir_path}.
    @param page_file: the page object, forwarded unchanged to
        C{_handle_error_page}.
    """
    assert len(tree_path) > 0
    destination_root = self.__dst_dir_path
    relative_dir = _convert_tree_path_to_dir_path(tree_path)
    target_dir = os.path.join(destination_root, relative_dir)
    lenient_makedir(target_dir)
    _handle_error_page(page_file, os.path.join(target_dir, "error.txt"))
def process(self, tree_path, page_file):
    """
    Process a page: run the error-page handler, then download the page.

    All elements of C{tree_path} except the last one determine the
    directory (under the destination directory of this object); the
    last element is used as the base name of the produced files:
    C{<name>-error.txt} is passed to C{_handle_error_page} and
    C{<name>.html} is passed to the private page-download method.

    @param tree_path: non-empty sequence of path elements; the last
        element must support concatenation with a string.
    @param page_file: the page object, forwarded unchanged to both
        C{_handle_error_page} and the download method.
    """
    assert len(tree_path) > 0
    destination_root = self.__dst_dir_path
    parent_dir = _convert_tree_path_to_dir_path(tree_path[:-1])
    target_dir = os.path.join(destination_root, parent_dir)
    lenient_makedir(target_dir)

    # The error page is handled first; the download follows it.
    error_name = tree_path[-1] + "-error.txt"
    _handle_error_page(page_file, os.path.join(target_dir, error_name))

    html_name = tree_path[-1] + ".html"
    self.__download_page(page_file, os.path.join(target_dir, html_name))
def process(self, tree_path, page_file):
    """
    Handle the error page for a node and then download the page itself.

    The directory is built from every element of C{tree_path} but the
    last and is located under the destination directory of this object.
    The last element names the files: C{<name>-error.txt} goes to
    C{_handle_error_page}, C{<name>.html} goes to the private
    page-download method.

    @param tree_path: non-empty sequence of path elements; the last
        element must support concatenation with a string.
    @param page_file: the page object, forwarded unchanged to both
        C{_handle_error_page} and the download method.
    """
    assert len(tree_path) > 0
    base_dir = os.path.join(
        self.__dst_dir_path,
        _convert_tree_path_to_dir_path(tree_path[:-1]))
    lenient_makedir(base_dir)

    def path_with_suffix(suffix):
        # Both output files share the directory and the leaf name and
        # differ only in the suffix appended to that name.
        return os.path.join(base_dir, tree_path[-1] + suffix)

    _handle_error_page(page_file, path_with_suffix("-error.txt"))
    self.__download_page(page_file, path_with_suffix(".html"))
def __init__(self, navigators, sentinel, activity_schedule=None,
             log_file_path=None, state_file_path=None, save_period=None,
             logging_level=logging.ERROR):
    """
    Set up the crawler: prepare the directories for the log and state
    files, optionally restore the tree from an existing state file, and
    store the configuration.

    @param navigators: list of navigators to be used by the crawler.
        Each navigator will be run in a separate thread, thus the
        number of the threads is equal to the number of navigators.
    @type navigators: list of L{AbstractTreeNavigator}s
    @param sentinel: a technical node which will be made parent of the
        root node.
    @type sentinel: L{AbstractNode}
    @param activity_schedule: if C{None}, no schedule is used and the
        program works until it finishes crawling (an
        L{AlwaysActiveSchedule} is installed in that case).
    @type activity_schedule: L{AbstractActivitySchedule}
    @param log_file_path: path to the log file. If C{None}, no log file
        will be used.
    @param state_file_path: path to the file where the state of the
        program will be saved. If C{None}, the state will not be saved.
        If the file already exists, the tree is loaded from it.
    @param save_period: time between saving the tree state. If
        C{state_file_path} is C{None}, this value is ignored.
    @param logging_level: one of the logging level constants from
        C{logging}
    """
    if log_file_path is not None:
        log_dir = os.path.dirname(log_file_path)
        lenient_makedir(log_dir)

    state_file_given = state_file_path is not None
    if state_file_given and os.path.exists(state_file_path):
        # Python 2 print statement: the trailing comma suppresses the
        # newline so that "Done." ends up on the same output line.
        print ("State file already exists. Loading the tree from this "
               "file and changing nodes with state PROCESSING to OPEN ... "),
        self.__load_state_file(state_file_path, sentinel)
        print ("Done.")
    elif state_file_given:
        # No state file yet: only its directory is prepared here.
        lenient_makedir(os.path.dirname(state_file_path))

    self.__tree = RWLockTreeAccessor(sentinel)
    self.__navigators = navigators
    self.__manager = None
    self.__state_file_path = state_file_path
    self.__save_period = save_period
    # Fall back to a default schedule when the caller supplied none.
    if activity_schedule is None:
        activity_schedule = AlwaysActiveSchedule()
    self.__activity_schedule = activity_schedule
    self.__logging_level = logging_level
    self.__log_file_path = log_file_path
def create(self, args):
    """
    Build the result for the destination directory given in C{args}.

    C{lenient_makedir} is called on C{args.destination_dir} first; then
    a C{LevelsCreator} is constructed for the same directory and the
    value returned by its C{create()} method is returned.

    @param args: object with a C{destination_dir} attribute
        (presumably parsed command-line arguments -- confirm against
        the caller).
    @return: whatever C{LevelsCreator(...).create()} returns.
    """
    destination_dir = args.destination_dir
    lenient_makedir(destination_dir)
    levels_creator = LevelsCreator(destination_dir)
    return levels_creator.create()