示例#1
0
def process_page(page, base_url, current_url, output, pc, recursion_limit):
    """Accumulate listings from ``page`` and from the following result pages.

    ``get_list`` is applied to the page that was passed in, then the page
    counter is advanced and further pages are fetched and processed for as
    long as the counter does not exceed ``recursion_limit``.

    Args:
        page: Already-fetched page object handed to ``get_list``.
        base_url: Base URL that accompanies ``page``.
        current_url: URL of ``page``. Not read by this function.
        output: Accumulator passed through ``get_list``.
        pc: Number of the page currently being processed.
        recursion_limit: Highest page number that will still be fetched.

    Returns:
        The accumulator as returned by the last ``get_list`` call.
    """
    output = get_list(page, output, base_url)
    pc += 1  # move on to the next page number
    while pc <= recursion_limit:
        # NOTE: follow-up pages always come from this fixed Walmart search,
        # regardless of which URL the first page was loaded from.
        next_url = "http://www.walmart.com/search/?query=cooler&page=" + str(pc)
        page, base_url = get_page(next_url, 1)
        output = get_list(page, output, base_url)
        pc += 1
    return output
示例#2
0
def main():
    """Scrape the start URL and its follow-up pages, then emit the results.

    Fetches the start page with ``get_page``, hands it to ``process_page``
    with an empty accumulator, passes the collected data to
    ``process_output`` and prints a completion banner.
    """
    # Previously this name was assigned several targets in a row and only the
    # last assignment took effect. The overwritten alternatives were:
    #   http://mugshots.louisvilleky.gov/archonixxjailsiteslmdc/archonixxjailpublic/
    #   https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&pageNum=8
    #   https://worldgaming.com/tournaments
    #   https://l3com.taleo.net/careersection/l3_ext_us/jobsearch.ftl
    initial_url = "http://www.walmart.com/search/?query=cooler&page=1"
    # TODO need to clean this up so that I can pass the initial_cookie and initial_url as part of the object init
    output = {}
    page, base_url = get_page(initial_url, 1)
    # Page counter starts at 1; pages are followed while the counter is <= 2.
    data = process_page(page, base_url, initial_url, output, 1, 2)
    process_output(data)
    print("Job finished. ***************************************")
示例#3
0
def get_details(entry):
    """Add the phone number from an entry's detail page to ``entry``.

    The page at ``entry['url']`` is fetched with ``get_page``; errors raised
    by that fetch are not handled here. The text of the first ``<strong>``
    element carrying ``itemprop="telephone"`` is stored under
    ``entry['Phone']``. If the lookup fails for any reason a message is
    printed and ``entry`` is left without that key being set.

    Args:
        entry: Mapping with at least a ``'url'`` key; modified in place.

    Returns:
        The same ``entry`` object.
    """
    detail_page, _ = get_page(entry['url'], 1)
    try:
        phone_tag = detail_page.find("strong", {"itemprop": "telephone"})
        # A missing tag leaves phone_tag as None, so this raises and falls
        # through to the best-effort handler below.
        entry['Phone'] = phone_tag.get_text()
    except Exception:
        print("Failed to get details for %s" % entry['url'])
    return entry
示例#4
0
 def _get_meta_rate(self, url):
     """Store review metadata scraped from ``url`` in ``self.product``.

     Sets two keys on ``self.product``:

     * ``'avg_review'`` -- the ``title`` attribute of the image inside the
       ``asinReviewsSummary`` span (presumably the star-rating text --
       confirm against a live page), or 0 when it cannot be found.
     * ``'num_reviews'`` -- the page text matching ``\\d+ customer reviews``
       (still the raw text, not a number), or 0 when there is no match.

     Both are 0 when the page states that the item has no customer reviews.

     Args:
         url: Address of the reviews page, fetched via ``get_page``.
     """
     page = get_page(url, 1)[0]
     if page.find(string=re.compile(r'There are no customer reviews for this item')):
         self.product['num_reviews'] = 0
         self.product['avg_review'] = 0
         return

     try:
         rating = page.find("span", {"class": "asinReviewsSummary"}).img.attrs['title']
     except (AttributeError, KeyError):
         # Span or image missing (lookup yielded None) or image has no title.
         rating = 0
     self.product['avg_review'] = rating

     # find() yields None rather than raising when nothing matches, so the
     # fallback has to be an explicit check.
     reviews = page.find(string=re.compile(r'\d+ customer reviews'))  # TODO: strip out text
     self.product['num_reviews'] = reviews if reviews is not None else 0