Exemplo n.º 1
0
class iwencaiSpider(scrapy.Spider):
    """Scrapy spider that starts from the iwencai.com stock-pick page.

    The spider fetches ``start_urls`` and, in ``parse``, re-requests the same
    URL with ``query_indicator`` as the callback. Logging and environment
    detection are delegated to the project's ``Common`` helper.
    """

    name = "wencai"
    allowed_domains = ["iwencai.com"]
    start_urls = [
        "http://www.iwencai.com/stockpick",
    ]
    source_currency = "RMB"
    picType = "jpg"
    # Debug flag forwarded to the Common helper; reset to "" when the
    # environment reported by Common.get_env() is "online".
    debug = ""
    # Placeholder task id; this spider's constructor does not assign one.
    taskId = -1
    # Replaced by a Common() instance in __init__.
    commonLib = False
    # Overwritten in __init__ with the value returned by Common.get_env().
    env_type = "offline"

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and the shared ``Common`` helper.

        Args:
            *args: Positional arguments forwarded to ``scrapy.Spider``.
            **kwargs: Keyword arguments (e.g. spider arguments) forwarded to
                ``scrapy.Spider``.
        """
        # Let scrapy.Spider do its own setup (name validation, spider
        # arguments) before the instance attributes below are assigned.
        super(iwencaiSpider, self).__init__(*args, **kwargs)

        self.commonLib = Common()
        self.env_type = self.commonLib.get_env()
        if self.env_type == "online":
            self.debug = ""
        self.commonLib.set_header("env_type", self.env_type)
        self.commonLib.set_header("debug", self.debug)
        self.commonLib.write_log("get task id is [%s]" % (self.taskId))

    def parse(self, response):
        """Schedule a follow-up request handled by ``query_indicator``.

        Args:
            response: The response for one of ``start_urls``.

        Yields:
            A ``scrapy.Request`` for ``response.url`` whose ``meta['method']``
            is ``"query_indicator"``.

        Any exception raised while building the request is formatted with
        ``Common.write_exception``, logged, and printed; it is not re-raised.
        """
        try:
            # NOTE(review): query_indicator is not defined in the visible part
            # of this class - presumably it lives elsewhere; confirm.
            request = scrapy.Request(response.url, callback=self.query_indicator)
            request.meta['method'] = "query_indicator"
            yield request
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            msgStr = self.commonLib.write_exception(exc_type, exc_value, exc_traceback)
            self.commonLib.write_log(msgStr)
            print(msgStr)
Exemplo n.º 2
0
class zaraSpider(scrapy.Spider):
    name = "zara"
    allowed_domains = ["zara.cn"]
    start_urls = [
        #"http://www.zara.cn/cn/zh/%E5%84%BF%E7%AB%A5-c277007.html",
        "http://www.zara.cn/cn/zh/%E5%84%BF%E7%AB%A5-c359013.html",
    ]
    source_currency = "RMB"
    picType = "jpg"
    debug = "true"
    taskId = -1
    commonLib = False
    env_type = "offline"

    def __init__(self, taskId=None, *args, **kwargs):
        """Initialise the spider with an optional task id.

        Args:
            taskId: Identifier of the crawl task. Anything accepted by
                ``int()`` is converted; when omitted (``None``) the class
                default ``-1`` is kept, so ``parse`` can report the missing
                id through its own ``taskId > 0`` check instead of the
                constructor failing with ``TypeError``.
            *args: Positional arguments forwarded to ``scrapy.Spider``.
            **kwargs: Keyword arguments forwarded to ``scrapy.Spider``.

        Raises:
            ValueError: If ``taskId`` is given but is not a valid integer.
        """
        super(zaraSpider, self).__init__(*args, **kwargs)
        # int(None) raises TypeError, so only convert when a value was given.
        if taskId is not None:
            self.taskId = int(taskId)
        self.commonLib = Common()
        self.env_type = self.commonLib.get_env()
        if self.env_type == "online":
            # Debug mode is switched off when running online.
            self.debug = ""
        self.commonLib.set_header("env_type", self.env_type)
        self.commonLib.set_header("debug", self.debug)
        self.commonLib.write_log("get task id is [%s]" % (self.taskId))

    def parse(self, response):
        # ## 打折

        try:        
            expectCnt = 1
            actualCnt = 0

            top_bar_list = ["男婴","女婴","男童","女童"]
            urlStatus = common.STATUS_DONE
            assert self.taskId > 0, "taskId [%s] should not be null" % (self.taskId)
            ## //li[@rootid]/ul/li/ul/li

            barList = response.xpath("//li[@class='current selected']/ul/li/a")

            expectCnt = len(barList)
            for category in barList:
                category_url = category.xpath("@href")[0].extract().strip() + "#" + common.LEVEL_HOME
                top_bar_name = category.xpath("text()")[0].extract().strip()
                top_bar = top_bar_name.split(" ")[0].strip()

                actualCnt = actualCnt + 1
                # if top_bar not in top_bar_list:
                #     self.commonLib.write_log("top_bar [%s] is not child category" % (top_bar))
                #     continue
                
                self.commonLib.write_log("top_bar_name is [%s], top_bar is [%s],parse zara url is [%s],actualCnt is [%s] " % (top_bar_name,top_bar,category_url,actualCnt))

                product_info = {}
                product_info['top_bar'] = top_bar
                request = scrapy.Request(category_url, callback=self.parse_category_list)
                request.meta['product_info'] = copy.deepcopy(product_info)
                yield request

            assert actualCnt == expectCnt and expectCnt>0, "parse ActualCnt [%s] is not [equal] expectCnt [%s]" % (actualCnt,expectCnt)

        except Exception, e:
            urlStatus = common.STATUS_FAIL
            exc_type, exc_value, exc_traceback = sys.exc_info()
            msgStr = self.commonLib.write_exception(exc_type, exc_value, exc_traceback)
            self.commonLib.write_log(msgStr)
            yield common.addLog(msgStr,self.taskId,common.LOG_FATAL,response.url,self.name)
        finally: