def worker(dept_code, pages):
    logging.info("Crawl Dep=%s, page start=%s, page end=%s, page count=%s"
                 % (dept_code, pages[0], pages[-1], len(pages)))
    dept_dir = "%s/%s" % (EXTRACT_DETAIL_DIR, dept_code)
    if not os.path.isdir(dept_dir):
        os.makedirs(dept_dir)
    # Each worker gets its own crawler, going out through a random proxy.
    ts_crawler = TSCrawler(proxy_host=random.choice(proxies))
    logging.info("Crawl department %s" % dept_code)
    form_response = ts_crawler.submit_form_by_dept(dept_code)
    for page in pages:
        logging.info("Department=%s, page=%s" % (dept_code, page))
        listing_filename = "%s/%s/listing-%s-%s.html" % (EXTRACT_DETAIL_DIR, dept_code, dept_code, page)
        # Skip pages whose listing was already saved in a previous run.
        if os.path.isfile(listing_filename):
            continue
        # Page 0 is already loaded by the form submission above.
        if page != 0:
            form_response = ts_crawler.get_listing_page(page)
        form_html = form_response.read()
        data = list(parse_listing(form_html))
        # Crawl the detail page of every row in the listing.
        for idx, _ in enumerate(data):
            detail_filename = "%s/%s/avantage-%s-%s-%s.html" % (EXTRACT_DETAIL_DIR, dept_code, dept_code, page, idx)
            if os.path.isfile(detail_filename):
                continue
            # Fetch before opening the file, so a failed request does not
            # leave an empty file behind that would be skipped on retry.
            detail_response = ts_crawler.get_detail(idx)
            if detail_response:
                with open(detail_filename, "w") as detail_file:
                    detail_file.write(detail_response.read())
        # Write the listing last: its presence marks the page as fully done.
        with open(listing_filename, 'w') as tmp_out:
            tmp_out.write(form_html)
    logging.info("Department=%s is finished" % dept_code)
def get_dept_remaining_tasks(dept_code):
    first_listing_page_filename = "%s/%s/listing-%s-0.html" % (EXTRACT_DETAIL_DIR, dept_code, dept_code)
    # Fetch and cache the department's first listing page if it is missing.
    if not os.path.isfile(first_listing_page_filename):
        crawler = TSCrawler()
        response = crawler.submit_form_by_dept(dept_code)
        dept_dir = "%s/%s/" % (EXTRACT_DETAIL_DIR, dept_code)
        if not os.path.isdir(dept_dir):
            os.makedirs(dept_dir)
        with open(first_listing_page_filename, 'w') as output:
            output.write(response.read())
    # The first page gives the total result count and the page size.
    with open(first_listing_page_filename, 'r') as first_listing_page:
        count, count_per_page = parse_listing_count_and_count_per_page(first_listing_page)
    if count_per_page == 0:
        pages_to_crawl = []
    else:
        pages_to_crawl = range(0, int(count / count_per_page) + 1)
    print "Dep=%s , total pages to crawl=%s" % (dept_code, len(pages_to_crawl))
    # Drop pages whose listing file already exists on disk.
    already_crawled_listings = glob.glob("%s/%s/listing-%s-*.html" % (EXTRACT_DETAIL_DIR, dept_code, dept_code))
    if pages_to_crawl:
        for name in already_crawled_listings:
            page = int(re.search(r"listing-\d{1,3}[AB]?-(\d{1,5})\.html", name).groups()[0])
            try:
                pages_to_crawl.remove(page)
            except ValueError:
                print name
    print "Dep=%s , remaining pages to crawl=%s" % (dept_code, len(pages_to_crawl))
    return pages_to_crawl
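The two functions are meant to be used together: get_dept_remaining_tasks works out which listing pages of a department still need fetching, and worker then crawls them. A minimal driver could look like the sketch below; the DEPT_CODES list, the crawl_department helper and the multiprocessing.Pool size are illustrative assumptions, not part of the original script.

# Hypothetical driver, assuming DEPT_CODES is a list of department codes
# (e.g. "01".."95", "2A", "2B") defined elsewhere in the script.
from multiprocessing import Pool

def crawl_department(dept_code):
    # Compute the remaining pages, then crawl them; skip finished departments.
    pages = get_dept_remaining_tasks(dept_code)
    if pages:
        worker(dept_code, pages)

if __name__ == "__main__":
    pool = Pool(4)  # number of parallel crawlers, arbitrary choice
    pool.map(crawl_department, DEPT_CODES)
    pool.close()
    pool.join()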