import os


def start_info_collection(folder_name):
    """Queue every product URL in `folder_name` that has not been collected yet.

    :param folder_name: category folder name
    :return: None
    """
    queue_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_URL_ROOT,
        DataCollectors_Configuration.PATH_STYLE,
        folder_name)
    queue_files = get_all_files(queue_path, DataCollectors_Configuration.PATTERN_3)
    completed_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_INFO_ROOT,
        DataCollectors_Configuration.PATH_STYLE,
        folder_name)
    completed_files = get_all_files(completed_path, DataCollectors_Configuration.PATTERN_4)

    # Gather every already-collected link so nothing is queued twice.
    completed_set = set()
    for completed_file in completed_files:
        if os.path.exists(completed_file):
            completed_set.update(file_to_set(completed_file))

    for queue_file in queue_files:
        if not os.path.exists(queue_file):
            continue
        for url in file_to_set(queue_file):
            url_split = url.split('|')
            hierarchy_path = '/'.join(url_split[2:-1])
            if url not in completed_set:
                # Ensure the output directory for this hierarchy exists,
                # then hand the URL to the worker threads.
                completed_path = '{}{}{}'.format(
                    DataCollectors_Configuration.LINIO_MEX_INFO_ROOT,
                    DataCollectors_Configuration.PATH_STYLE,
                    hierarchy_path)
                if not os.path.exists(completed_path):
                    create_project_dir(completed_path)
                urls_queue.put(url)
    urls_queue.join()
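# `get_all_files`, `file_to_set`, and `create_project_dir` are project helpers
# defined elsewhere; the sketches below are assumptions inferred from how they
# are called above, not the project's actual implementations.
import fnmatch


def get_all_files(root, pattern):
    # Recursively collect paths under `root` whose file names match `pattern`.
    matches = []
    for dir_path, _dir_names, file_names in os.walk(root):
        for file_name in fnmatch.filter(file_names, pattern):
            matches.append(os.path.join(dir_path, file_name))
    return matches


def file_to_set(file_name):
    # Read a text file into a set of stripped, non-empty lines.
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            if line.strip():
                results.add(line.strip())
    return results


def create_project_dir(directory):
    # Create the directory (and any missing parents) on first use.
    if not os.path.exists(directory):
        os.makedirs(directory)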
def start_url_collection(folder_name):
    """Expand each paginated category record and queue the pages not yet collected.

    :param folder_name: category folder name
    :return: None
    """
    queue_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_HIERARCHY_ROOT,
        DataCollectors_Configuration.PATH_STYLE,
        folder_name)
    queue_files = get_all_files(queue_path, DataCollectors_Configuration.PATTERN_1)
    completed_path = '{}{}{}'.format(
        DataCollectors_Configuration.LINIO_MEX_URL_ROOT,
        DataCollectors_Configuration.PATH_STYLE,
        folder_name)
    completed_files = get_all_files(completed_path, DataCollectors_Configuration.PATTERN_2)

    # Gather every already-collected line so no page is queued twice.
    completed_set = set()
    for completed_file in completed_files:
        if os.path.exists(completed_file):
            completed_set.update(file_to_set(completed_file))

    for queue_file in queue_files:
        if not os.path.exists(queue_file):
            continue
        for url in file_to_set(queue_file):
            # Each record has the form 'hierarchy|...|last_page|page_url'.
            url_split = url.split('|')
            last_page = int(url_split[-2])
            page_url = url_split[-1]
            hierarchy = '|'.join(url_split[0:-2])
            hierarchy_path = '/'.join(url_split[2:-2])
            # Expand the record into one line per result page.
            for page_no in range(1, last_page + 1):
                modified_url = '{}?page={}'.format(page_url, page_no)
                line = '{}|{}'.format(hierarchy, modified_url)
                if line not in completed_set:
                    completed_path = '{}{}{}'.format(
                        DataCollectors_Configuration.LINIO_MEX_URL_ROOT,
                        DataCollectors_Configuration.PATH_STYLE,
                        hierarchy_path)
                    if not os.path.exists(completed_path):
                        create_project_dir(completed_path)
                    urls_queue.put(line)
    urls_queue.join()
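# Both functions above block on urls_queue.join(), which only returns after
# worker threads call task_done() for every queued record. A minimal sketch of
# that wiring, assuming a hypothetical per-record handler collect_one(url)
# (not part of the original code):
import threading
from queue import Queue

urls_queue = Queue()


def collect_one(url):
    # Hypothetical stand-in for the real record handler.
    print('collecting', url)


def _worker():
    # Drain the queue forever; task_done() is what lets urls_queue.join()
    # unblock once every item has been handled.
    while True:
        url = urls_queue.get()
        try:
            collect_one(url)
        finally:
            urls_queue.task_done()


for _ in range(8):  # worker count is an arbitrary choice for this sketch
    threading.Thread(target=_worker, daemon=True).start()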
def boot() -> None:
    """Create the project directory and data files, then load both work sets."""
    create_project_dir(Spider.project_name)
    create_data_files(Spider.project_name, Spider.base_url)
    Spider.queue = file_to_set(Spider.queue_file)
    Spider.crawled = file_to_set(Spider.crawled_file)
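# boot() above assumes a Spider class that keeps its state in class
# attributes, plus a create_data_files() helper that seeds the queue and
# crawled files. The attribute names below are taken from the calls in
# boot(); the example values are assumptions:
class Spider:
    project_name = 'spider_project'               # assumed example value
    base_url = 'https://example.com'              # assumed example value
    queue_file = project_name + '/queue.txt'      # assumed file layout
    crawled_file = project_name + '/crawled.txt'  # assumed file layout
    queue = set()
    crawled = set()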
def create_jobs() -> None:
    """Push every queued link to the worker threads, then re-check the queue."""
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()
def crawl() -> None:
    """Report how many links remain and, if any, dispatch them as jobs."""
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print('{} links in the queue'.format(len(queued_links)))
        create_jobs()
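# crawl() and create_jobs() call each other until the queue file is empty:
# create_jobs() refills the Queue and blocks on join(), then crawl() re-reads
# the file to see whether the workers discovered new links. A sketch of the
# surrounding wiring, assuming QUEUE_FILE points at the Spider's queue file
# and that each worker processes one page per item (the page-processing step
# is a placeholder, not the original code):
import threading
from queue import Queue

queue = Queue()                 # drained by the worker threads below
QUEUE_FILE = Spider.queue_file


def work():
    while True:
        url = queue.get()
        # The real worker would fetch and parse `url` here, appending any new
        # links to the queue file before marking this item done.
        print(threading.current_thread().name, 'crawled', url)
        queue.task_done()


if __name__ == '__main__':
    boot()
    for _ in range(4):          # worker count is an arbitrary choice
        threading.Thread(target=work, daemon=True).start()
    crawl()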