    def __init__(self, source=0, test=0, limit=0, *args, **kwargs):
        super(NewURLsSpider, self).__init__(*args, **kwargs)
        self.source = source
        self.test = test
        self.limit = limit

        # Store the source and date in a report summary variable
        self.report_summary = []
        self.report_summary.append("Source: %s" % source)
        self.report_summary.append("Test: %s" % test)
        self.report_summary.append(
            "Date: %s" % (datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')))
        # Create a page counter variable
        self.page_counter = 1
        # Create a product counter variable
        self.product_counter = 0
        # Return an error in case no source code was passed along.
        if self.source == 0:
            self.logger.critical("No source code was passed along!")
            #sys.exit()
            raise CloseSpider('No source.')

        # Set the xpath dictionary, which stores all the scraping
        # information per source in the custom module called
        # product_scrape_xpaths.
        self.xpath_dict = product_scrape_xpaths.get_dict()

        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()

        except MySQLdb.Error as e:
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                print("MySQL Error: %s" % str(e))
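
    # For reference, self.xpath_dict (from the custom product_scrape_xpaths
    # module) is keyed by source code and, based purely on how it is used in
    # this file, holds per-source settings such as 'query_url' (appended to
    # each product URL) and 'na_url' (a "not available" fallback URL),
    # alongside the actual xpaths. A purely hypothetical entry:
    #
    #   {'30': {'query_url': '?currency=USD',
    #           'na_url': 'http://www.example.com/not-available',
    #           ...}}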
    def start_requests(self):

        # Now fetch some URLs from the product_urls table.
        # Define which table to use according to Test or Real.
        if (self.test == 1) or (self.test == '1'):
            query = ("SELECT `id`, `url` "
                     "FROM `product_urls` "
                     "WHERE `source` = %s AND `status` = 0 "
                     "LIMIT {0} ".format(self.limit))
            self.cursor.execute(query, (self.source,))
        else:
            query = ("SELECT `id`, `url` "
                     "FROM `product_urls_SCRAPY` "
                     "WHERE `source` = %s AND `status` = 0 "
                     "LIMIT {0} ".format(self.limit))
            self.cursor.execute(query, (self.source,))
                
        rows = self.cursor.fetchall()
        
        # Store the dictionary with source xpaths in a variable,
        # as obtained from the custom product_scrape_xpaths module.
        self.xpath_dict = product_scrape_xpaths.get_dict()

        # Because of Scrapy's duplicate request filter we need to disable
        # filtering for some sources, since their "not available" page would
        # otherwise be dropped as a duplicate.
        sources_no_filtering = ['6', '7', '20', '25', '31', '21']
        if self.source in sources_no_filtering:
            no_filter = True
        else:
            no_filter = False

        
        # Loop through each of the rows and initiate a scrape for each of
        # them based on the URL. Also pass along the id of the row from the
        # product_urls table as meta data (for later reference) and handle
        # the response through the parse function.
        for row in rows:
            # Edit the url with some query parameters, for setting USD or location
            query = self.xpath_dict[self.source]['query_url']
            request_url = row[1] + query
        
            yield scrapy.Request(
                url=request_url, meta={'id': row[0], 'url': row[1]}, 
                callback=self.parse, dont_filter=no_filter)
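
        # Example invocation (illustrative -- the spider's `name` attribute
        # is not shown in this excerpt and the argument values are
        # placeholders):
        #
        #   scrapy crawl <newurls_spider_name> -a source=6 -a test=1 -a limit=100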
        
        """
예제 #3
0
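    # Custom process_response for a downloader middleware. Judging from the
    # calls to self._redirect() and self._redirect_request_using_get(), this
    # overrides Scrapy's built-in RedirectMiddleware in order to handle
    # sources that return a 301 without a Location header.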
    def process_response(self, request, response, spider):
        
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            
            return response

        allowed_status = (301, 302, 303, 307)

        # Source 30 (TVC-Mall) returns a 301 without a Location header.
        # This code is not compatible with the ScrapeThumbs and
        # ScrapeOldThumbs spiders, because the spider.source attribute is
        # not available there.
        not_allowed_source = ('30',)

        if (('Location' not in response.headers
                and spider.source not in not_allowed_source)
                or response.status not in allowed_status):
            return response
        
        if spider.source in not_allowed_source:
            # Get the dict per source with the xpaths
            self.xpath_dict = product_scrape_xpaths.get_dict()
            location = safe_url_string(self.xpath_dict[spider.source]['na_url'])
        else:
            location = safe_url_string(response.headers['location'])

        redirected_url = urljoin(request.url, location)
        # ADDED 10-11-2016 For 302 redirects to the mobile webpage!
        # if re.match('^http[s]?:\/\/m[\.][^\.]+[\.]com[\.]*', redirected_url) is not None:
        # redirected_url = redirected_url.replace("//m.", "//www.")
        
        if response.status in (301, 307) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)
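
    # To enable a custom redirect middleware like this one, it would
    # typically be registered in the project's settings.py in place of the
    # built-in RedirectMiddleware (default priority 600). The module path
    # below is an assumption for illustration only:
    #
    #   DOWNLOADER_MIDDLEWARES = {
    #       'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    #       'scraper1.middlewares.CustomRedirectMiddleware': 600,
    #   }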
    def __init__(self, source=0, test=0, limit=0, *args, **kwargs):
        super(NewURLsSpider, self).__init__(*args, **kwargs)
        self.source = source
        self.test = test
        self.limit = limit

        # Store the source and date in a report summary variable
        self.report_summary = []
        self.report_summary.append("Source: %s" % source)
        self.report_summary.append("Test: %s" % test)
        self.report_summary.append("Date: %s" % (
            datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')))
        # Create a page counter variable
        self.page_counter = 1
        # Create a product counter variable
        self.product_counter = 0
        # Return an error in case no source code was passed along.
        if self.source == 0:
            self.logger.critical("No source code was passed along!")
            #sys.exit()
            raise CloseSpider('No source.')
        # Set the xpath dictionary, which stores all the information per
        # source in the custom module called product_scrape_xpaths.
        self.xpath_dict = product_scrape_xpaths.get_dict()

        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()

        except MySQLdb.Error as e:
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                print("MySQL Error: %s" % str(e))
class UpdateProductSpider(scrapy.Spider):
    # This spider is allowed to handle responses with a 404
    handle_httpstatus_list = [404]
    custom_settings = {
        'ITEM_PIPELINES': {
            'scraper1.pipelines.pipelines.MySQLUpdateProduct': 100
        }
    }
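
    # custom_settings overrides the project-wide settings for this spider
    # only; here it enables just the MySQLUpdateProduct item pipeline
    # (the value 100 is the pipeline ordering).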

    # Set the short-name for this spider:
    name = "ScrapeUpdateProduct"
    # Restrict the domains which the spider can operate on.
    # Make sure to extend this list when adding new sources.
    allowed_domains = [
        "dx.com", "banggood.com", "focalprice.com", "miniinthebox.com",
        "lightinthebox.com", "tmart.com", "gearbest.com", "tinydeal.com",
        "geekbuying.com", "dealsmachine.com", "newfrog.com", "tomtop.com",
        "fasttech.com", "chinavasion.com", "tvc-mall.com", "antelife.com",
        "cafago.com", "chinabuye.com", "dinodirect.com", "sunsky-online.com",
        "cndirect.com", "zapals.com"
    ]

    # Define the initializing function, used to catch the source number
    # passed along while running the spider as an argument (-a).
    def __init__(self,
                 source=0,
                 test=0,
                 limit=0,
                 group=0,
                 cats=0,
                 images=0,
                 descrp=0,
                 *args,
                 **kwargs):
        super(UpdateProductSpider, self).__init__(*args, **kwargs)
        self.source = source
        self.test = test
        self.limit = limit
        self.updateCategories = cats
        self.updateDescriptions = descrp
        self.updateImages = images
        self.group = group

        # Set the right database tables depending on whether we run for Real or Testing
        if (self.test == 1) or (self.test == '1'):
            self.urls_table = 'product_urls'
            self.details_table = 'product_details'
        else:
            self.urls_table = 'product_urls_SCRAPY'
            self.details_table = 'product_details_SCRAPY'

        # Store the source and date in a report summary variable
        self.report_summary = []
        self.report_summary.append("Source: %s" % source)
        self.report_summary.append(
            "Date: %s" % (datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')))

        # Return an error in case no source code was passed along.
        if self.source == 0:
            logging.critical("No source code was passed along!")
            sys.exit()

    # Overwrite the default start_requests function
    # in order to be able to pull the (start) URLs from the database
    # and run them concurrently.
    def start_requests(self):

        # Products are divided into different groups:
        # In Stock, Out of Stock and Not Active (urls.status == 4).

        # In Stock
        if (self.group == 0 or self.group == '0'):
            stock = 0  # 0 is in stock
            status = 1
            self.updatetime = time.time() - 172800  # seconds / 48 hours

        # Out of Stock
        if (self.group == 1 or self.group == '1'):
            stock = 1
            status = 1
            self.updatetime = time.time() - 604800  # seconds / 7 days

        # Not Active
        if (self.group == 2 or self.group == '2'):
            status = 4
            self.updatetime = time.time() - 2592000  # seconds / 30 days

        # For a faster update sequence
        if (self.group == 9 or self.group == '9'):
            stock = 0  # 0 is in stock
            status = 1
            self.updatetime = time.time() - 7200  # seconds / 2 hours

        # Overwrite to 1 second for test purposes
        if (self.test == 0) or (self.test == '0'):
            self.updatetime = time.time() - 1  # 1 second!

        # Starting the DB connection
        try:
            # Start the db connection through the custom module.
            self.conn = mysql_connection.setup_conn()
            self.cursor = self.conn.cursor()

        except MySQLdb.Error as e:
            try:
                print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
            except IndexError:
                print("MySQL Error: %s" % str(e))

        if (self.updateImages == 1 or self.updateImages == '1'):
            # Different query if we want to scrape new images.
            # DELETE LATER IMG_STATUS != 9
            query = (
                "SELECT urls.id, urls.url, urls.url_hash, urls.pid "
                "FROM `{0}` AS urls "
                "INNER JOIN `{1}` AS details "
                "ON urls.pid = details.id "
                "WHERE urls.processing = 0 AND urls.source = %s "
                "AND urls.status = 1 AND urls.img_status != 9 "
                "AND (details.thumbs_extra = 0 OR details.thumbs_extra = 1) "
                "LIMIT {2}".format(self.urls_table, self.details_table,
                                   self.limit))

            logging.debug("We got the Image Query")

            self.cursor.execute(query, (self.source, ))

        # The normal query for Active Products
        elif (self.group == 0 or self.group == '0'
              or self.group == 1 or self.group == '1'):
            # For clean-up purposes also take `updated` IS NULL with the query.
            query = (
                "SELECT urls.id, urls.url, urls.url_hash, urls.pid "
                "FROM `{0}` AS urls "
                "INNER JOIN `{1}` AS details "
                "ON urls.pid = details.id "
                "WHERE urls.processing = 0 AND urls.source = %s "
                "AND urls.status = %s AND details.stock = %s "
                "AND (details.updated < {2} OR details.updated IS NULL) "
                "ORDER BY details.stock "
                "LIMIT {3} ".format(self.urls_table, self.details_table,
                                    self.updatetime, self.limit))

            self.cursor.execute(query, (self.source, status, stock))

        # The query for Not-Active Products
        # NEEDS TO BE UPDATED, DNRY!
        else:
            # For clean-up purposes also take `updated` IS NULL with the query.
            query = (
                "SELECT urls.id, urls.url, urls.url_hash, urls.pid "
                "FROM `{0}` AS urls "
                "INNER JOIN `{1}` AS details "
                "ON urls.pid = details.id "
                "WHERE urls.processing = 0 AND urls.source = %s "
                "AND urls.status = %s "
                "AND (details.updated < {2} OR details.updated IS NULL) "
                "ORDER BY details.stock "
                "LIMIT {3} ".format(self.urls_table, self.details_table,
                                    self.updatetime, self.limit))

            self.cursor.execute(query, (self.source, status))

        rows = self.cursor.fetchall()

        if rows:
            # Get a list of the selected IDs.
            # Because of the `processing` boolean we are able to run multiple
            # spiders at once.
            # Update these IDs in the urls table and SET processing to 1.
            id_list = []

            for row in rows:
                id_list.append(row[0])

            logging.debug("What is the id_list: %s", id_list)

            string_id_list = ','.join(map(str, id_list))
            self.id_list = string_id_list
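            # e.g. "101,102,103" -- interpolated directly into the IN (...)
            # clause of the UPDATE query below.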

            query = ("UPDATE `{0}` As urls "
                     "SET urls.processing = 1 "
                     "WHERE urls.source = %s AND urls.id IN ( {1} ) ".format(
                         self.urls_table, string_id_list))

            self.cursor.execute(query, (self.source, ))

            self.conn.commit()

        # Close the db connection when done.
        self.conn.close()

        # Get the dict per source with the xpaths
        self.xpath_dict = product_scrape_xpaths.get_dict()

        # Because of Scrapy's duplicate request filter we need to disable
        # filtering for some sources, since their "not available" page would
        # otherwise be dropped as a duplicate.
        sources_no_filtering = [
            '6', '7', '20', '22', '25', '28', '30', '31'
        ]  # 08-11-2016 - Added GearBest 28 - 27-7-2017 Added TVC 30
        if self.source in sources_no_filtering:
            no_filter = True
        else:
            no_filter = False

        # Loop through each of the rows and initiate a scrape for each of
        # them based on the URL. Also pass along the id of the row from the
        # product_urls table as meta data (for later reference) and handle
        # the response through the parse function.
        for row in rows:
            # Edit the url with some query parameters, for setting USD or location
            query = self.xpath_dict[self.source]['query_url']
            request_url = row[1] + query

            # 11-08-16 Changed dont_redirect to False because of 301 and 302
            # not-allowed issues.
            if re.match(r'^http[s]?:\/\/www[\.][^\.]+[\.]com[\.]*',
                        row[1]) is not None:
                yield scrapy.Request(url=request_url,
                                     meta={
                                         'id': row[0],
                                         'url_hash': row[2],
                                         'pid': row[3],
                                         'dont_redirect': False
                                     },
                                     callback=self.parse,
                                     dont_filter=no_filter)
            else:
                logging.warning('Request Error on ID: %s for url %s', row[0],
                                request_url)
                continue
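
# Example invocation (argument values are placeholders; source, group, limit,
# test, cats, images and descrp are the -a arguments handled in __init__):
#
#   scrapy crawl ScrapeUpdateProduct -a source=6 -a group=0 -a limit=200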