Example #1
def get_product_task(prdid):
    cli = get_cli()
    d = cli.get(KEY_PRODUCT_TASK.format(p=prdid))
    if d is None:
        return None

    try:
        return json.loads(d)
    except Exception as e:
        # Bad JSON in the store: log the raw value and return None,
        # the same result as a missing key.
        logger.info('get task: {p} {d} {e}'.format(p=prdid,
                                                   d=repr(d),
                                                   e=repr(e)))
        return None
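The helpers get_cli, KEY_PRODUCT_TASK and set_product_task are defined elsewhere in this project and are not shown in the examples. A minimal sketch of what they might look like, assuming the task store is Redis accessed through redis-py; the key layout, host, port and the stored fields below are illustrative guesses, not the project's actual values:

import json
import time

import redis

KEY_PRODUCT_TASK = 'amazon:product:task:{p}'  # assumed key layout

_cli = None

def get_cli():
    # Lazily create one shared Redis client; host/port/db are assumptions.
    global _cli
    if _cli is None:
        _cli = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
    return _cli

def set_product_task(prdid):
    # Record which product was handed out and when, as a JSON blob,
    # so get_product_task() can read it back later.
    task = {'asin': prdid, 'ts': time.time()}
    get_cli().set(KEY_PRODUCT_TASK.format(p=prdid), json.dumps(task))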
Example #2
def next_product_url():
    cli = get_cli()

    while True:
        product = cli.lpop(KEY_PRODUCTS)
        if product is None:
            logger.info('no product in queue ...')
            time.sleep(5)
            continue

        try:
            product = json.loads(product)['asin']

            if not product_is_timeout(product):
                logger.info('product : ' + product +
                            ' is not timeout, {day} days'.format(
                                day=CRAWL_PRODUCT_TIMEOUT))
                continue

            set_product_task(product)
            del_product_reviews(product)

            logger.info('next product : ' + product)
            return 'http://www.amazon.com/ss/customer-reviews/' + product

        except Exception as e:
            logger.exception(repr(e))
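product_is_timeout and del_product_reviews are also project helpers not shown here. One plausible reading, given the log message above and the 'ts' field from the set_product_task sketch in Example #1: a product is due for a re-crawl once its stored task is older than CRAWL_PRODUCT_TIMEOUT days. The helper below is a sketch under that assumption; the field name, the units and the cutoff value are guesses:

CRAWL_PRODUCT_TIMEOUT = 7  # days; illustrative value

def product_is_timeout(prdid):
    # Never crawled (no stored task) counts as timed out.
    task = get_product_task(prdid)
    if task is None:
        return True
    # Re-crawl once the stored task is older than the timeout.
    age_days = (time.time() - task['ts']) / 86400.0
    return age_days >= CRAWL_PRODUCT_TIMEOUT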
Example #3
    def parse(self, response):
        # Debug helper: dump the raw page for inspection.
        # with open('/tmp/amazon.html', 'w') as f:
        #     f.write(response.body)

        logger.info('fetch : ' + response.url)
        prdid = response.url.split('?')[0].split('/')[-1]

        review = f_xpath(response, '//table[@id="productReviews"]/tr/td')
        if review is None:
            # No review table on this page: fetch the next product instead.
            yield scrapy.http.Request(url=next_product_url(),
                                      callback=self.parse)
            # Without this return, the xpath calls below would run on None.
            return

        rids = xpath_extract(review, './a/@name')
        details = xpath(review, './div')

        length = min(len(rids), len(details))
        for i in xrange(length):

            rdetail = details[i]
            divs = xpath(rdetail, './div')

            # len(divs) is at most 7 (indices 0 - 6):
            # 0 : number of helpful review
            # 1 : star, helpful text, date
            # 2 : reviewer, reviewer from
            # 3 : from
            # 4 : free product
            # 5 : reviewText
            # 6 : helpful?

            d = self.empty_item()
            d['prdid'] = prdid
            d['rid'] = rids[i]
            d['text'] = ' '.join(
                xpath_extract(rdetail, './div[@class="reviewText"]/text()'))

            while len(divs) > 0:
                div = divs[0]
                divs = divs[1:]

                text = div.extract()

                if 'people found the following review helpful' in text:
                    d['num_help_review'] = self.parse_num_help_review(div)
                    continue

                if 'out of' in text and 'stars' in text and '</nobr>' in text:
                    d['star'], d['help_text'], d['date'] = \
                        self.parse_star_help_date(div)
                    continue

                if 'By' in text and 'See all my reviews' in text:
                    d['reviewer'], d['reviewer_from'] = \
                        self.parse_reviewer_from(div)
                    continue

                if 'This review is from' in text:
                    d['rfrom'] = self.parse_from(div)

                # Any div that reaches this point (matched or not)
                # ends the header scan.
                break

            yield d

        next_url = self.next_page(response) or next_product_url()

        # see http://doc.scrapy.org/en/latest/topics/request-response.html
        yield scrapy.http.Request(url=next_url,
                                  callback=self.parse,
                                  dont_filter=True)
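The selector helpers f_xpath, xpath and xpath_extract wrap Scrapy's XPath API and are defined elsewhere in the project. A plausible minimal version, assuming they behave the way parse() uses them (first match or None, list of selector nodes, list of extracted strings):

def xpath(sel, path):
    # All matching nodes, as a list of Scrapy selectors.
    return sel.xpath(path)

def f_xpath(sel, path):
    # First matching node, or None when nothing matches.
    nodes = sel.xpath(path)
    return nodes[0] if nodes else None

def xpath_extract(sel, path):
    # Matching nodes, extracted as a list of strings.
    return sel.xpath(path).extract()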