def process_page(page, base_url, current_url, output, pc, recursion_limit):
    """Accumulate listings from ``page`` and the result pages that follow it.

    Each page is handed to ``get_list`` together with the running ``output``;
    whatever ``get_list`` returns becomes the new ``output``. The page counter
    is then advanced and, while it does not exceed ``recursion_limit``, the
    next result page is fetched with ``get_page`` and processed the same way.

    The pages are walked with a loop rather than one recursive call per page,
    so the number of pages is not bounded by Python's recursion limit.

    Args:
        page: Parsed page to start from (as returned by ``get_page``).
        base_url: Base URL that accompanies ``page``; forwarded to ``get_list``.
        current_url: URL of ``page``. Accepted for interface compatibility but
            not read by this function.
        output: Accumulator passed to and returned from ``get_list``.
        pc: Number of the page currently held in ``page``.
        recursion_limit: Highest page number that will be fetched.

    Returns:
        The accumulator after the last page has been processed.
    """
    while True:
        output = get_list(page, output, base_url)
        pc += 1  # page count up 1
        if pc > recursion_limit:
            return output
        # NOTE(review): the next-page URL is hard-coded to one Walmart search
        # and ignores current_url, so paging only works for that query.
        next_url = "".join(("http://www.walmart.com/search/?query=cooler&page=", str(pc)))
        page, base_url = get_page(next_url, 1)
def main():
    """Scrape the configured search URL and pass the result to ``process_output``.

    Fetches the initial page with ``get_page``, lets ``process_page`` walk the
    result pages, forwards the collected data to ``process_output`` and prints
    a completion banner.
    """
    # Only one target is active at a time. The alternatives below used to be
    # assigned and immediately overwritten; they are kept here as comments so
    # a target can be switched by uncommenting it.
    # initial_url = "http://mugshots.louisvilleky.gov/archonixxjailsiteslmdc/archonixxjailpublic/"
    # initial_url = "https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&pageNum=8"
    # initial_url = "https://worldgaming.com/tournaments"
    # initial_url = "https://l3com.taleo.net/careersection/l3_ext_us/jobsearch.ftl"
    initial_url = "http://www.walmart.com/search/?query=cooler&page=1"
    # TODO need to clean this up so that I can pass the initial_cookie and initial_url as part of the object init

    first_page_number = 1
    last_page_number = 2  # process_page stops once the page count exceeds this

    output = {}
    page, base_url = get_page(initial_url, 1)
    data = process_page(
        page, base_url, initial_url, output, first_page_number, last_page_number
    )
    process_output(data)
    print("Job finished. ***************************************")
def get_details(entry):
    """Add the phone number found on an entry's detail page to the entry.

    The page at ``entry['url']`` is fetched with ``get_page`` and searched for
    a ``<strong itemprop="telephone">`` element whose text is stored under
    ``entry['Phone']``. This is best effort: if the lookup fails for any
    reason a message is printed and the entry is returned without ``'Phone'``
    being set.

    Extraction of further fields (URN, street, town, parish, postal code) was
    sketched in commented-out code here and is not performed.

    Args:
        entry: Mapping with at least a ``'url'`` key; modified in place.

    Returns:
        The same ``entry`` object.
    """
    detail_page, _ = get_page(entry['url'], 1)
    try:
        phone_tag = detail_page.find("strong", {"itemprop": "telephone"})
        entry['Phone'] = phone_tag.get_text()
    except Exception:
        print("Failed to get details for %s" % entry['url'])
    return entry
def _get_meta_rate(self, url):
    """Populate the review fields of ``self.product`` from the page at ``url``.

    The page is fetched with ``get_page``; the first element of its result is
    searched. If the page states that there are no customer reviews, both
    ``'num_reviews'`` and ``'avg_review'`` are set to 0. Otherwise each field
    is filled from the page, falling back to 0 when the expected markup or
    text is missing.

    Args:
        url: Address of the review/ranking page to inspect.

    Returns:
        None. Results are written to ``self.product``.
    """
    soup = get_page(url, 1)[0]

    no_reviews_text = re.compile(r'There are no customer reviews for this item')
    if soup.find(string=no_reviews_text):
        self.product['num_reviews'] = 0
        self.product['avg_review'] = 0
        return

    # NOTE(review): the rating title is stored under 'num_reviews' and the
    # "N customer reviews" text under 'avg_review'. That looks swapped, but
    # the consumers of self.product are not visible here, so the original
    # key assignment is kept - confirm before changing.
    try:
        rating = soup.find("span", {"class": "asinReviewsSummary"}).img.attrs['title']
    except Exception:
        # Best effort: missing span/img/title means no usable value.
        # Exception (not a bare except) so Ctrl-C and SystemExit still propagate.
        rating = 0
    self.product['num_reviews'] = rating

    try:
        reviews = soup.find(string=re.compile(r'\d+ customer reviews'))
    except Exception:
        reviews = None
    # find() signals "no match" by returning None rather than raising, so the
    # fallback to 0 has to be applied explicitly.
    # TODO: strip out text
    self.product['avg_review'] = reviews if reviews is not None else 0