示例#1
0
文件: views.py 项目: Adyg/crawldemo
def home(request):
    """Render ``home.html``; on POST also look the submitted product URL up.

    On a POST request the ``url`` form field (or a built-in sample product
    URL when the field is absent) is passed to ``diffbot.product`` and the
    first entry of the response's ``objects`` list is exposed to the template
    as ``product``. The value of the URL's ``productId`` parameter is exposed
    as ``product_id``; it is an empty string when the URL has no such
    parameter.

    NOTE(review): a diffbot response without a non-empty ``objects`` list
    still raises KeyError/IndexError here - confirm whether the template
    should show an error message instead.
    """
    data = {
        'url': ''
    }

    if request.method == 'POST':
        data['url'] = request.POST.get('url', 'http://www.toysrus.com/product/index.jsp?productId=24447876')

        # Debug output. The single-argument parenthesised form prints the
        # same text on Python 2 and is also valid Python 3 syntax.
        print(data['url'])

        json_result = diffbot.product(data['url'], token=settings.DIFFBOT_TOKEN)
        print(json_result)
        data['product'] = json_result['objects'][0]

        # The URL comes from the user, so it may not contain "productId=";
        # indexing the split result blindly would raise IndexError.
        url_bits = data['url'].split('productId=')
        if len(url_bits) > 1:
            # keep only the parameter value, dropping any following parameters
            data['product_id'] = url_bits[1].split('&')[0]
        else:
            data['product_id'] = ''

    return render(request, 'home.html', data)
示例#2
0
    def handle(self, *args, **options):
        """Export the customer reviews of every requested product to a file.

        For each product id in ``options['pid']`` the product is looked up
        through ``diffbot.product`` to obtain its SKU. Review pages are then
        downloaded one after another, starting at page 1, until a page cannot
        be fetched or decoded or contains no reviews. The collected reviews
        are handed to ``export_to_file`` together with ``options['format']``
        and the product id.

        NOTE(review): ``options['pid']`` is iterated, so it is presumably a
        list of ids - confirm against the command's argument declaration.

        Raises:
            CommandError: if the ``pid`` or the ``format`` option is ``None``.
        """
        # make sure the required options are present
        if options['pid'] is None:
            raise CommandError("Option `--pid=...` must be specified.")

        if options['format'] is None:
            raise CommandError("Option `--format=...` must be specified (json or xml).")

        # build the url (http://www.toysrus.com/pwr/content/10/07/24447876-en_US-12-reviews.js)
        base_url = 'http://www.toysrus.com/pwr/content/%s/%s-en_US-%s-reviews.js'

        for pid in options['pid']:
            # diffbot: look the product up to learn its SKU
            url = 'http://www.toysrus.com/product/index.jsp?productId=%s' % pid
            json_result = diffbot.product(url, token=settings.DIFFBOT_TOKEN)
            sku = json_result['objects'][0]['specs']['sku']

            path_prefix = self._encode_pid(pid)
            exportable_data = []
            review_number = 0
            page = 1

            while True:
                review_page_url = base_url % (path_prefix, pid, page)
                try:
                    entries = self._fetch_review_page(review_page_url)
                    if not entries:
                        # an empty page means there is nothing left to read;
                        # without this check the loop would never terminate
                        break

                    for entry in entries:
                        review = entry['r']
                        review_number += 1
                        exportable_data.append(
                            {
                                'sku': sku,
                                'title': review['h'],
                                'rating': review['r'],
                                'text': review['p'],
                                'submissionTime': review['db'],
                                'displayName': review['n'],
                                'externalId': review['id'],
                                # NOTE(review): the running review number is
                                # exported as the e-mail address - presumably
                                # a placeholder, confirm with the consumer.
                                'emailAddress': review_number,
                            }
                        )
                except Exception:
                    # Best effort: the first page that cannot be downloaded,
                    # decoded or read ends the pagination. Unlike a bare
                    # ``except:`` this lets KeyboardInterrupt and SystemExit
                    # propagate so the command can still be aborted.
                    break
                page += 1

            export_to_file(options['format'], exportable_data, pid)

    def _encode_pid(self, pid):
        """Return the ``"NN/NN"`` directory part of the review URL for *pid*."""
        # Checksum over the characters of the id. ``str(pid)`` keeps this
        # working when the id is not already a string.
        checksum = sum(ord(char) * (255 - ord(char)) for char in str(pid)) % 1023
        # zero-pad to four digits and split them into two pairs
        digits = str(checksum).zfill(4)
        return digits[:2] + "/" + digits[2:4]

    def _fetch_review_page(self, review_page_url):
        """Download one review file and return its decoded list of entries."""
        response = urllib2.urlopen(review_page_url)
        try:
            review_data_raw = response.read()
        finally:
            response.close()

        # The file has the form "<target> = <literal>;": keep what follows
        # the first ' = '.
        review_data = review_data_raw.split(' = ', 1)[1]
        # Drop only the trailing statement terminator; removing every ';'
        # would also strip semicolons from the review texts themselves.
        review_data = review_data.strip().rstrip(';')
        return demjson.decode(review_data)