Example #1
    def get_product_info(self, url, page, category_code):
        url_sanitized = format_url(url)
        title = self.get_product_title(page)
        primary_image = self.get_product_primary_image(page)
        price = self.get_product_price(page)
        features = self.get_product_features(page)
        extra_info = self.get_product_extra_info(page, category_code)

        if not primary_image:
            log("No product image detected, skipping")

        product = ProductRecord(
            title=title,
            product_url=url_sanitized,
            listing_url=url_sanitized,  # TODO: delete attr
            price=price,
            primary_img=primary_image,
            crawl_time=None,
            category_code=category_code,
            category=CATEGORY_LABELS[int(category_code)],
            features=features,
            asin=extra_info.get('asin'),
            dimensions=extra_info.get('product_dimensions'),
            weight=extra_info.get('product_weight'),
            shipping_weight=extra_info.get('shipping_weight'),
            package_dimensions=extra_info.get('product_dimensions'),
            package_weight=extra_info.get('package_weight'),
        )
        product_id = product.save()

        if product_id:
            log('Product saved! {}'.format(product_id))
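The format_url helper that these examples call is never shown on this page. A minimal sketch of what it might look like, assuming it resolves relative links against the marketplace host and strips query/fragment noise; the base hosts and the meaning of the walmart flag are assumptions inferred from Examples #9 and #11, not the project's actual implementation:

from urllib.parse import urljoin, urlsplit, urlunsplit

# Assumed base hosts -- not part of the original examples.
AMAZON_BASE = "https://www.amazon.com"
WALMART_BASE = "https://www.walmart.com"

def format_url(url, walmart=False):
    # Resolve relative links against the assumed marketplace host,
    # then drop the query string and fragment to normalize the URL.
    base = WALMART_BASE if walmart else AMAZON_BASE
    scheme, netloc, path, _query, _fragment = urlsplit(urljoin(base, url.strip()))
    return urlunsplit((scheme, netloc, path, "", ""))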
Example #2
def handle_listing(page, platform, url, sql_handler):
    if platform == 'indeed':
        title = extractors.extract_element_text(
            page, 'h1', {'class': 'jobsearch-JobInfoHeader-title'})
        company = extractors.extract_element_text(
            page, 'div', {'class': 'jobsearch-CompanyReview--heading'})
        if not company:
            company = extractors.extract_element_text(
                page, 'div', {'class': 'icl-u-lg-mr--sm icl-u-xs-mr--xs'})
        job_meta_header = extractors.extract_element_text(
            page, 'span', {'class': 'jobsearch-JobMetadataHeader-item'})
        desc = extractors.extract_element_text(page, 'div',
                                               {'id': 'jobDescriptionText'})
        url = extractors.extract_element_attr_value(page, 'meta',
                                                    {'id': 'indeed-share-url'},
                                                    'content')
        job_id = helpers.get_url_param_value(url, 'jk')
        date = extractors.extract_indeed_job_footer_text(page)
        sql_handler.save_indeed_job(job_id=job_id,
                                    date=date,
                                    company=company,
                                    title=title,
                                    job_meta=job_meta_header,
                                    text=desc,
                                    url=url,
                                    platform=platform)
    elif platform == 'twitter':
        next_token = handle_twitter_response(page)
        while next_token:
            token_url = helpers.format_url(
                url, platform, add_param={'pagination_token': next_token})
            page = helpers.make_request(token_url, platform)
            next_token = handle_twitter_response(page)

    elif platform == 'Volkswagen_press':
        release_id = platform + '_' + helpers.get_url_path_element(url, -1)
        title = extractors.extract_element_text(page, 'h1',
                                                {'class': 'page--title'})
        company = "Volkswagen"
        date = extractors.extract_element_text(page, 'div',
                                               {'class': 'meta--item'}, 0)
        date_string = extractors.extract_date_string_from_text(date, platform)
        meta_topics = extractors.extract_child_element_text(
            page, 'div', {'class': 'meta--item'}, 'a', {'content-link': ''}, 2,
            0)
        short_summary = extractors.extract_list_text_by_parent(
            page, 'div', {'class': 'topic-list'})
        summary = extractors.extract_child_element_text(
            page, 'div', {'class': 'page-item--intro'}, 'p', None, 0, 0)
        text = extractors.extract_concatinated_text_by_element(
            page, 'div', {'class': 'page-item--text'}, 'p')
        sql_handler.save_press_release(release_id=release_id,
                                       company=company,
                                       release_date=date_string,
                                       topics=meta_topics,
                                       url=url,
                                       title=title,
                                       short_summary=short_summary,
                                       summary=summary,
                                       text=text)
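The extractors module used above is not included on this page. A rough sketch of how extract_element_text might be written over a BeautifulSoup document, with the signature inferred from the calls in this example; the body is an assumption, not the project's real code:

def extract_element_text(page, tag, attrs=None, index=0):
    # Return the stripped text of the index-th element matching tag/attrs,
    # or None when no such element exists (so callers can fall back, as the
    # company lookup above does).
    matches = page.find_all(tag, attrs=attrs) if attrs else page.find_all(tag)
    if len(matches) > index:
        return matches[index].get_text(strip=True)
    return None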
Example #3
def fetch_listing():

    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return
    # Redis returns bytes, so decode only after confirming the queue was not empty
    url = url.decode('utf-8')

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("div", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time,
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
Example #4
def fetch_listing():
    '''
    This is the root function that green threads call.
    This is essentially step 1 (but step 0 is above!)
    '''
    global crawl_time

    # Pop a random URL from the Redis listing_url_queue
    url = helpers.dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = helpers.make_request(url)
    if not page:
        return
    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:
        product_image = extractors.get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = extractors.get_title(item)
        product_url = extractors.get_url(item)
        product_price = extractors.get_price(item)

        product = models.ProductRecord(
            title=product_title,
            product_url=helpers.format_url(product_url),
            listing_url=helpers.format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time)
        product_id = product.save()
        helpers.download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        helpers.enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
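The dequeue_url/enqueue_url helpers are only referenced in these examples; the comment in this one says the URL is popped from a Redis listing_url_queue. A minimal sketch under that assumption (the connection settings, key name, and use of a Redis set are guesses), which would also explain the bytes decoding seen in Example #3:

import redis

# Assumed connection settings and key name -- not taken from the project.
redis_client = redis.StrictRedis(host="localhost", port=6379, db=0)
QUEUE_KEY = "listing_url_queue"

def enqueue_url(url):
    # Add a listing URL to the shared crawl set (duplicates are ignored).
    return redis_client.sadd(QUEUE_KEY, url)

def dequeue_url():
    # Pop a random URL from the crawl set; returns bytes, or None when empty.
    return redis_client.spop(QUEUE_KEY)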
Example #5
def fetch_listing():

    global crawl_time
    url = dequeue_url()
    if not url:
        log("WARNING: No URLs found in the queue. Retrying...")
        pile.spawn(fetch_listing)
        return

    page, html = make_request(url)
    if not page:
        return

    items = page.findAll("li", "s-result-item")
    log("Found {} items on {}".format(len(items), url))

    for item in items[:settings.max_details_per_listing]:

        product_image = get_primary_img(item)
        if not product_image:
            log("No product image detected, skipping")
            continue

        product_title = get_title(item)
        product_url = get_url(item)
        product_price = get_price(item)

        product = ProductRecord(
            title=product_title,
            product_url=format_url(product_url),
            listing_url=format_url(url),
            price=product_price,
            primary_img=product_image,
            crawl_time=crawl_time,
        )
        product_id = product.save()
        # download_image(product_image, product_id)

    # add next page to queue
    next_link = page.find("a", id="pagnNextLink")
    if next_link:
        log(" Found 'Next' link on {}: {}".format(url, next_link["href"]))
        enqueue_url(next_link["href"])
        pile.spawn(fetch_listing)
Example #6
def get_url(item):
    try:
        # select(...)[0] raises IndexError when no detail-page link is present
        link_tag = item.select("a.s-access-detail-page")[0]
        return format_url(link_tag['href'])
    except Exception:
        return "<missing product url>"
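get_title and get_price are used alongside get_url throughout these examples but are never shown. A sketch in the same defensive style, assuming BeautifulSoup result items; the CSS selectors and the missing-price placeholder are assumptions (only the missing-title string is confirmed by Example #7):

def get_title(item):
    # Return the product title text, or a placeholder when the element is absent.
    tag = item.select_one("h2.s-access-title")  # selector is an assumption
    return tag.get_text(strip=True) if tag else "<missing product title>"

def get_price(item):
    # Return the displayed price text, or a placeholder when the element is absent.
    tag = item.select_one("span.s-price")  # selector is an assumption
    return tag.get_text(strip=True) if tag else "<missing product price>"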
Example #7
def fetch_listing(ASIN, marketplace):

    global crawl_time
    url = marketplace.country_host + "/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=" + ASIN
    if not url:
        log("WARNING: No URLs {} found in the queue. Retrying...".format(url))
        #pile.spawn(fetch_listing)
        return

    page, html = make_request(ASIN, marketplace.country_host)
    if not page:
        log("WARNING: No page. Retrying")
        #sleep(3)
        #fetch_listing(ASIN, marketplace)
    if page is None:
        return amazon_api(ASIN, url, marketplace.country_code)
    item = page
    product_image = get_primary_img(item)
    if not product_image:
        log("No product image detected, skipping")
        # continue
    product_title = get_title(item)
    product_url = get_url(item)
    product_price = get_price(item)
    product_indexing = get_indexing(item)
    if (product_title == '<missing product title>'
            and product_url == '<missing product url>'):
        product = amazon_api(ASIN, url, marketplace.country_code)
    else:
        product = ProductRecord(title=product_title,
                                product_url=format_url(product_url),
                                listing_url=format_url(url),
                                price=product_price,
                                primary_img=product_image,
                                product_indexing=product_indexing,
                                crawl_time=crawl_time,
                                asin=ASIN)
    return product
Example #8
def fetch_listing(start, end):
    global crawl_time
    with open('amazon-products.p', 'rb') as pf:
        product_dict = pickle.load(pf)
    index = start - 1
    count = 0
    for product_url in product_dict:  # keys of the pickled dict are product URLs
        #print product_url
        index += 1
        page1, html1 = make_request(product_url)
        try:
            # visit the page specified by product_url
            temp_dict = {}
            product_title = product_dict[product_url]
            product_price = page1.find(
                "span", "a-size-medium a-color-price").get_text().strip()

            #extract product info from comparison_table
            table = page1.find(
                "table",
                "a-bordered a-horizontal-stripes a-spacing-mini a-size-base comparison_table"
            )
            for i in table.findAll("tr"):
                if "a-span3 comparison_attribute_name_column comparison_table_first_col" in str(
                        i):
                    k = i.find("td").find("span").get_text()
                    v = i.find("th").find("span").get_text()
                    temp_dict[v] = k

            #extract product info from product details Table
            tables = page1.findAll("table", "a-keyvalue prodDetTable")
            for table2 in tables:
                for i in table2.findAll("tr"):
                    k = i.find("td").get_text().strip()
                    v = i.find("th").get_text().strip()
                    temp_dict[v] = k

            product = ProductRecord(title=product_title,
                                    product_url=format_url(product_url),
                                    price=product_price,
                                    properties=temp_dict)
            product_name = settings.a_products_path + str(index) + ".p"
            pickle.dump(product, open(product_name, 'wb'))
            #print_product(index)
            count += 1
            print(count, index, product_price)
            sys.stdout.flush()
        except Exception as e:
            print("Exception##:" + str(index) + '\t' + str(e))
Example #9
def dump_urls():
    visited = {}
    with open(settings.w_URL_file, 'w') as w:
        while queue:  # while queue is not empty
            url = dequeue_url()
            if not url:
                log("Queue empty")
                return

            if url in visited:  # we've already seen this product
                continue
            else:
                visited[url] = True  # mark that we've seen it
            # need to add host to url
            url = format_url(url, walmart=True)
            w.write('%s\n' % url)
Example #10
def begin_crawl(crawl_more):

    visited = {}
    product_dict = {}
    if crawl_more:

        with open(settings.a_URL_file, 'r') as w:
            urls = (w.readlines())
        for url in urls:
            url = url.strip()
            visited[url] = True

    w = open(settings.a_URL_file, 'a')
    with open(settings.start_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank and commented out lines
            page, html = make_request(line)
            url = line
            count = 0
            while page is not None and count <= 50:
                items = page.findAll("li", "s-result-item")
                for item in items[:settings.max_details_per_listing]:
                    product_image = get_primary_img(item)
                    if not product_image:
                        continue
                    product_title = get_title(item)
                    product_url = get_url(item)
                    product_price = get_price(item)
                    if product_url not in visited:
                        count += 1
                        print(product_url, product_price, product_title)
                        visited[product_url] = True  # mark that we've seen it
                        # need to add host to url
                        product_url = format_url(product_url)
                        w.write('%s\n' % product_url)
                        product_dict[product_url] = (product_title,
                                                     product_price)
                        print(count, product_url, product_dict[product_url])

                next_link = page.find("a", id="pagnNextLink")
                if next_link:
                    page, html = make_request(next_link["href"])
                    url = next_link["href"]
                else:
                    break  # no further pages; without this the while loop never ends
    w.close()
    pickle.dump(product_dict, open("amazon-products.p", "wb"))
Example #11
def begin_crawl(session):
    # explode out all of our category `start_urls` into subcategories
    with open(settings.w_start_file, "r") as f:
        session = dryscrape.Session()
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank and commented out lines
            url = line

            session.visit(url)
            response = session.body()
            soup = BeautifulSoup(response, "html5lib")
            count = 0

            i = 1  # starting page
            while soup is not None:
                print('page %d of link: %s' % (i, line))
                # look for products listed on this page
                results = soup.findAll('div',
                                       'search-result-gridview-item clearfix'
                                       )  # items in gridview

                for result in results:
                    link = result.find('a')
                    if not link:
                        continue
                    link = link['href']
                    count += 1
                    enqueue_url(link)

                i += 1
                # go to list of pages at bottom
                p_list = soup.find('ul', 'paginator-list').findAll('li')
                for p in p_list:
                    # search for 'next' ordinal page, visit that next page for next iteration of while loop
                    if not p.has_attr('class') and str(i) in p.find('a').text:
                        url = format_url(p.find('a')['href'], walmart=True)
                        session.visit(url)
                        response = session.body()
                        soup = BeautifulSoup(response, "html5lib")
                        break
                    else:
                        soup = None  # if None for all, there is no next page and we can stop searching this link

            log("Found {} results on {}".format(count, line))