示例#1
0
    def parse_product_detail(self, response):
        """Fill a CMB app product item from the detail page and request its rules.

        The product code is read back from the ``Code=<digits>`` field of the
        POST body that produced ``response``. Name and type come from the
        inline ``prdname`` / ``prdtype`` values in the page text, the
        remaining fields from ``<span>`` elements looked up by id.

        Yields:
            A form request to ``self.rule_url`` whose callback is
            ``parse_product_rules``; the partially filled item travels in
            ``meta['item']``.

        Raises:
            AttributeError: if the request body contains no ``Code=<digits>``.
        """
        item = BankproductItem()
        item['bankCode'] = 'cmb'
        item['channel'] = 'app'

        request_body = str(response.request.body, encoding='utf-8')
        # Raw string: "\d" inside a plain literal is an invalid escape sequence.
        item['proCode'] = re.search(r'Code=(\d+)', request_body).group(1)
        item['proName'] = self.__get_re_value(response.text,
                                              'prdname: "(.*?)"', 1)
        item['proType'] = self.__get_re_value(response.text,
                                              'prdtype: "(.*?)"', 1)

        # Item field -> id of the <span> that holds its value.
        span_fields = (
            ('incomeRateName', 'ctl00_cphBody_RatBre'),
            ('incomeRate', 'ctl00_cphBody_PrdRat'),
            ('cycleTime', 'ctl00_cphBody_TerDay'),
            ('riskLevel', 'ctl00_cphBody_RiskLvl2'),
            ('currentPurchases', 'ctl00_cphBody_SalAmt'),
            ('firstSubMinAmount', 'ctl00_cphBody_SbsUqt'),
        )
        for field, span_id in span_fields:
            item[field] = self.__get_xpath_value(
                response, "//span[@id='{}']/text()".format(span_id)).strip()

        # The shared form dict is updated in place, so after this call it
        # holds the code of the product handled last.
        self.form_data3['Code'] = item['proCode']
        self.form_data3['behavior_prodcode'] = item['proCode']
        yield scrapy.FormRequest(self.rule_url,
                                 method="POST",
                                 meta={'item': item},
                                 formdata=self.form_data3,
                                 callback=self.parse_product_rules,
                                 dont_filter=True)
示例#2
0
    def parse(self, response):
        """Parse one page of the SPD product list and schedule follow-ups.

        For every entry in the ``rows`` array of the JSON payload an item is
        pre-filled and a detail-page request is yielded with the item in
        ``meta['item']``. When the response text still contains a non-empty
        ``rows`` array, a form request for the next page is yielded as well
        (built from the current request body with ``page`` incremented).

        Raises:
            ValueError: if rows are present but the ``channelid`` found in the
                request body is not one of the two known channels. The
                original code failed here with an unbound local variable.
            AttributeError: if a required field (``channelid`` / ``page``) is
                missing from the request body.
        """
        # Extract the product rows from the payload.
        content = self.__get_response_content(response)
        product_list = json.loads(content)['rows']
        request_body = str(response.request.body, encoding='utf-8')

        if product_list:
            # Everything derived from the request body is identical for all
            # rows of this page, so it is resolved once up front.
            params = urllib.parse.unquote(request_body)
            type_code = self.__get_re_value(params, r'product_type=(\d+)', 1)
            if type_code:
                # Product type code -> display name.
                pro_attr = type_code.replace('0', '私行专属') \
                    .replace('2', "净值类").replace('3', "固定期限") \
                    .replace('4', "现金管理类")
            else:
                pro_attr = "汇理财"

            channelid = re.search(r'channelid=(\d+)', request_body).group(1)
            if channelid == '266906':
                detail_base_url = self.detail_url
            elif channelid == '263468':
                detail_base_url = self.detail2_url
            else:
                raise ValueError(
                    'unknown channelid {!r}: no detail url configured'.format(
                        channelid))

        # The payload is a collection of product objects.
        for product_item in product_list:
            item = BankproductItem()
            item['bankCode'] = 'spd'
            item['channel'] = 'web'
            item['proCode'] = product_item['finance_no']
            item['proName'] = product_item['finance_allname']
            item['proAttr'] = pro_attr
            item['incomeRate'] = product_item['finance_anticipate_rate']
            # Risk grade letter -> display name.
            item['riskLevel'] = product_item['finance_risklevel'] \
                .replace('A', "低风险等级").replace('B', "较低风险等级") \
                .replace('C', "中风险等级").replace('D', "较高风险等级") \
                .replace('E', "高风险等级")
            item['firstAmount'] = product_item['finance_indi_ipominamnt']
            # Optional keys default to an empty string.
            item['nextOpenDate'] = product_item.get('finance_next_openday', "")
            item['instructionUrl'] = product_item.get('product_attr', "")
            item['status'] = product_item['finance_state']

            yield scrapy.Request('{}{}'.format(detail_base_url,
                                               item['proCode']),
                                 meta={'item': item},
                                 callback=self.parse_product_detail,
                                 dont_filter=True)

        # Is there a next page of data?
        exist_data = re.search(r'"rows":\[([\S\s]+)\]', response.text)
        if exist_data:
            current_page = int(
                re.search(r'page=(\d+)', request_body).group(1))
            next_form_data = self.fromData2Dict(request_body)
            next_form_data['page'] = str(current_page + 1)
            yield scrapy.FormRequest(self.start_url,
                                     method="POST",
                                     formdata=next_form_data,
                                     dont_filter=True)
示例#3
0
 def parse_organization(self, response):
     """Walk the CIB announcement list and emit one item per entry.

     The product attribute is the text of the last link in the ``top``
     block. Entries dated in the current year get a follow-up request
     handled by ``product_announcement`` (item passed in ``meta['item']``);
     entries from other years are yielded as they are.

     Raises:
         IndexError: if the ``top`` block has no matching link text.
         ValueError: if an entry's date is not in ``%Y-%m-%d`` form.
     """
     top_block = response.xpath("//div[@id='content']//div[@class='top']")
     attr_label = top_block.css("p>a:last-child::text").extract()[0]
     listing = response.xpath(
         "//div[@id='content']//div[@class='middle']/ul")

     for entry in listing.xpath("li"):
         record = BankproductItem()
         record['bankCode'] = 'cib'
         record['channel'] = 'web'
         record['proAttr'] = attr_label
         record['proName'] = self.__get_xpath_value(entry, "a/text()")

         published = self.__get_xpath_value(entry,
                                            "span[@class='time']/text()")
         entry_year = datetime.strptime(published, '%Y-%m-%d').year

         # Entries from other years are emitted without a follow-up.
         if entry_year != datetime.now().year:
             yield record
             continue

         detail_url = "{}{}".format(
             self.base_url,
             self.__get_xpath_value(entry, "a/@href").strip())
         yield scrapy.Request(detail_url,
                              meta={'item': record},
                              callback=self.product_announcement,
                              dont_filter=True)
示例#4
0
    def parse(self, response):
        """Extract HXB products from the list page and yield one item each.

        Every ``li[@name='pageli']`` under ``#content`` is one product. The
        number shown in ``box_num`` is stored as ``incomeRate`` when its
        caption is the expected-annualised-yield label, otherwise as
        ``proNetValue``. The sale period text is split on '至' into
        ``startDate`` and ``endDate``.

        Raises:
            AttributeError: if a product's sale period text contains no '至'.
        """
        selector = Selector(response)
        for product_item in selector.xpath(
                "//*[@id='content']//li[@name='pageli']"):
            item = BankproductItem()
            item['bankCode'] = 'hxb'
            item['channel'] = 'web'

            item['proName'] = self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']/p[@class='box_title']/a/text()"
            ).strip()
            item['cycleTime'] = self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']/ul//span[@class='highlight']/text()"
            ).strip()
            title = self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']/div[@class='box_lf']/p[2]/text()"
            ).strip()

            # Both branches read the same number; only the target differs.
            box_num = self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']//p[@class='box_num']/text()").strip()
            if title == '预期最高年化收益率':
                item['incomeRateName'] = title
                item['incomeRate'] = box_num
            else:
                item['proNetValue'] = box_num

            # Look the sale period up once and split it on '至'.
            sale_period = self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']//span[text()='发售日期']/../span[2]/text()"
            )
            item['startDate'] = re.search('(.*?)至', sale_period).group(1)
            item['endDate'] = re.search('至(.*)', sale_period).group(1)

            item['firstSubMinAmount'] = self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']//span[@class='amt']/text()"
            ).strip() + self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']//span[@class='amt']/following-sibling::text()"
            ).strip()

            sellChannel = self.__get_xpath_value(
                product_item,
                "div[@class='pro_box']//span[text()='购买渠道']/../span[2]/text()"
            ).strip()
            # Drop all whitespace; raw string because "\s" is not a valid
            # escape in a plain literal.
            item['sellChannel'] = re.sub(r'[\r\n\t\s]', '', sellChannel)
            yield item
示例#5
0
    def parse_private(self, response):
        """Yield one CIB item per product row of every product-type table.

        Each ``<p>`` without an ``align`` attribute in the ``middle`` block
        names a product type; the first following ``<table>`` lists its
        products. Rows whose cells contain '产品名称' are skipped as headers.
        Name and code both come from the first cell: either as a single text
        node ending in a bracketed code, or as two text nodes (name, then
        bracketed code).
        """
        bracket_pattern = r"[\((][^\((]+[\))]"
        type_headings = Selector(response).xpath(
            "//*[@id='content']//div[@class='middle']/p[not(@align)]")

        for heading in type_headings:
            # The table belonging to a heading is its next sibling table.
            table = heading.xpath("following-sibling::table[1]")
            header_cells = len(table.xpath("tbody/tr[1]").css("td"))

            # All rows except the header row.
            for row in table.xpath("tbody/tr[not(contains(td,'产品名称'))]"):
                row_cells = len(row.xpath("td"))
                record = BankproductItem()
                record['bankCode'] = 'cib'
                record['channel'] = 'web'
                record['proType'] = self.__get_xpath_value(
                    heading, "strong/text()").strip()[2:]

                # Name and code; a <br/> in the cell yields two text nodes.
                first_cell = row.xpath("td[1]/text()").extract()
                single_node = len(first_cell) == 1
                if single_node:
                    stripped = first_cell[0].strip()
                    groups = re.findall(bracket_pattern, stripped)
                    if len(groups) > 1:
                        # Several bracketed groups: the last one is the code.
                        code = groups[-1]
                    else:
                        code = re.search(bracket_pattern, stripped).group(0)
                    # NOTE(review): here the code keeps its brackets, while
                    # the two-node branch below strips them - confirm which
                    # form is intended.
                else:
                    code = first_cell[1].strip()[1:-1]
                record['proCode'] = code

                if single_node:
                    raw_name = first_cell[0]
                    code_at = raw_name.find(str(record['proCode']))
                    record['proName'] = raw_name[0:code_at - 1]
                else:
                    record['proName'] = first_cell[0].strip()

                # A row with as many cells as the header has no merged cell,
                # so its values sit one column further right.
                first_col = 3 if row_cells == header_cells else 2
                record['cycleTime'] = self.__get_xpath_value(
                    row, "td[{}]/text()".format(first_col)).strip()
                record['nextIncomeRate'] = self.__get_xpath_value(
                    row, "td[{}]/text()".format(first_col + 1)).strip()

                yield record
示例#6
0
    def parse(self, response):
        """Parse one page of the CMBC app product list.

        For every entry of ``respData.list`` an item is pre-filled and a JSON
        POST for the product's detail is yielded (callback
        ``parse_product_detail``, item in ``meta['item']``). If the response
        text contains at least one ``"PRD_CODE"`` entry, the next page is
        requested by advancing ``startId`` by ``pageSize``, both read back
        from the body of the request that produced this response.
        """
        products = json.loads(response.text)['respData']['list']
        for product in products:
            item = BankproductItem()
            item['bankCode'] = 'cmbc'
            item['channel'] = 'app'
            item['proCode'] = product['PRD_CODE'].strip()
            item['proName'] = product['PRD_NAME'].strip()
            item['proAttr'] = product['PRD_ATTR'].strip()
            # PRD_TYPE (0: daily, 1: periodically open, 2: closed,
            # 3: income, 4: NAV periodic, 5: demand)
            item['proType'] = product['PRD_TYPE_NAME'].strip()
            item['incomeRateName'] = product['INCOME_TYPE']
            item['incomeRate'] = product['INCOME_RATE']
            item['nextIncomeRate'] = product['NEXT_INCOME_RATE']
            item['proNetValue'] = product['NAV']
            item['openDate'] = product['START_DATE'].strip()
            item['realEndDate'] = product['REALEND_DATE'].strip()
            item['cycleTime'] = product['LIV_TIME_UNIT_NAME'].strip()
            item['firstAmount'] = product['FIRST_AMT']
            # currency (156: CNY, 840: USD)
            item['currency'] = product['CURR_TYPE_NAME'].strip()

            # The shared payload dict is updated in place and serialised
            # right away for this request.
            self.request_data2['prdCode'] = item['proCode']
            yield scrapy.Request(self.start_url,
                                 method="POST",
                                 body=json.dumps(self.request_data2),
                                 meta={'item': item},
                                 headers={'Content-Type': 'application/json'},
                                 callback=self.parse_product_detail,
                                 dont_filter=True)

        # Is there a next page of data?
        if re.search('"PRD_CODE":"(.*?)"', response.text):
            request_body = str(response.request.body, encoding='utf-8')
            current_start_id = int(
                self.__get_re_value(request_body, '"startId": "(.*?)"', 1))
            # Raw string: "\d" is an invalid escape in a plain literal.
            page_size = int(
                self.__get_re_value(request_body, r'"pageSize": "(\d+)"', 1))
            self.request_data['startId'] = str(current_start_id + page_size)
            yield scrapy.Request(self.start_url,
                                 method="POST",
                                 body=json.dumps(self.request_data),
                                 headers={'Content-Type': 'application/json'},
                                 dont_filter=True)
示例#7
0
 def parse_retail_cash(self, response):
     """Yield CIB retail cash-management products from the page's table.

     Rows whose cells contain '产品名称' are skipped as headers. The income
     rate label is taken from the 7th cell of the table's first row. The
     product code is the part of the image file name between ``lccp`` and
     ``.png`` (empty when the row has no image). Rows whose name cell holds
     a link get a follow-up request handled by ``product_announcement``
     (item in ``meta['item']``); other rows are yielded directly.
     """
     table = response.xpath(
         "//*[@id='content']//div[@class='middle']/table")
     proAttr = self.__get_xpath_value(
         response,
         "//*[@id='content']//div[@class='middle']/h1/text()").strip()
     # The header label is the same for every row, so read it once.
     income_rate_name = self.__get_xpath_value(
         table,
         "tbody/tr[1]/td[7]/strong/text()|tbody/tr[1]/td[7]/text()"
     ).strip()
     name_href_xpath = "td[1]/strong/a/@href|td[1]/a/@href"

     for product_item in table.xpath("tbody/tr[not(contains(td,'产品名称'))]"):
         item = BankproductItem()
         item['bankCode'] = 'cib'
         item['channel'] = 'web'
         item['proAttr'] = proAttr
         item['proName'] = self.__get_xpath_value(
             product_item, "td[1]/strong/a/text()|td[1]/a/text()").strip()
         item['sellArea'] = self.__get_xpath_value(
             product_item, "td[2]/text()|td[2]/strong/text()").strip()
         item['currency'] = self.__get_xpath_value(
             product_item, "td[3]/text()|td[3]/strong/text()").strip()
         item['cycleTime'] = self.__get_xpath_value(
             product_item, "td[4]/text()|td[4]/strong/text()").strip()
         item['proType'] = self.__get_xpath_value(
             product_item, "td[5]/text()|td[5]/strong/text()").strip()
         item['firstAmount'] = self.__get_xpath_value(
             product_item, "td[6]/text()|td[6]/strong/text()").strip()
         item['incomeRateName'] = income_rate_name
         item['incomeRate'] = self.__get_xpath_value(
             product_item, "td[7]/text()|td[7]/strong/text()").strip()

         product_pic = product_item.xpath("td[8]/img/@src")
         # The dot is escaped so it only matches the literal ".png" suffix.
         item['proCode'] = re.search(
             r'lccp(.*?)\.png',
             product_pic.extract()[0]).group(1) if product_pic else ''

         # Does the name cell contain a hyperlink?
         if product_item.xpath(name_href_xpath).extract():
             next_page_url = self.__get_xpath_value(product_item,
                                                    name_href_xpath)
             yield scrapy.Request(next_page_url,
                                  meta={'item': item},
                                  callback=self.product_announcement,
                                  dont_filter=True)
         else:
             yield item
示例#8
0
    def parse(self, response):
        """Parse one page of the CEB product list.

        Yields a detail-page request (callback ``parse_product_detail``, item
        in ``meta['item']``) for every ``<li>`` of the product list, carrying
        the income rate shown on the list page. When the response text
        contains the marker ``cpmc-``, the next page is requested by
        incrementing the ``page`` value found in the current request body.
        """
        # One <li> per product.
        for product_item in response.xpath(
                "//div[@class='lccp_main_content_tx']/ul/li"):
            item = BankproductItem()
            item['bankCode'] = 'ceb'
            item['channel'] = 'web'
            item['incomeRate'] = self.__get_xpath_value(
                product_item,
                "p[@class='lccp_syl']/span[@class='lccp_ll fc_box']/text()"
            ).strip()
            product_item_url = "{}{}".format(
                self.base_url,
                self.__get_xpath_value(product_item, "a/@href").strip())
            yield scrapy.Request(product_item_url,
                                 callback=self.parse_product_detail,
                                 meta={'item': item},
                                 dont_filter=True)

        # Is there a next page of data? A plain substring test is enough:
        # the original pattern only checked for the 'cpmc-' prefix.
        if 'cpmc-' in response.text:
            # The current page number is read back from the request body.
            request_body = str(response.request.body, encoding='utf-8')
            page_index = int(
                re.search(r'page=(\d+)', request_body).group(1)) + 1
            self.form_data['page'] = str(page_index)
            yield scrapy.FormRequest(self.start_url,
                                     method='POST',
                                     formdata=self.form_data,
                                     dont_filter=True)
示例#9
0
    def parse_retail_index(self, response):
        """Yield CIB retail products from every ``tbody/tr`` row of the page.

        The product code is the part of the 9th cell's image file name
        between ``lccp`` and ``.png``. The name is the link text of the first
        cell when it contains an ``<a>`` element, otherwise the cell's own
        text. Rows with a link get a follow-up request handled by
        ``parse_product_detail`` (item in ``meta['item']``); other rows are
        yielded directly.

        Raises:
            IndexError: if a row lacks one of the expected cells or the image.
            AttributeError: if the image path does not match the pattern.
        """
        def cell_text(row, column):
            # First text node of the given cell, stripped.
            return row.xpath(
                'td[{}]/text()'.format(column)).extract()[0].strip()

        selector = Selector(response)
        for product_item in selector.xpath("//tbody/tr"):
            item = BankproductItem()
            item['bankCode'] = 'cib'
            item['channel'] = 'web'
            # The dot is escaped so it only matches the literal ".png" suffix.
            item['proCode'] = re.search(
                r'lccp(.*?)\.png',
                product_item.xpath('td[9]/img/@src').extract()[0]).group(1)
            item['proAttr'] = '零售理财'

            # Prefer the link text when the name cell contains a hyperlink.
            link_text = re.search(r'<a[\S\s]*>(.*?)</a>',
                                  product_item.xpath('td[1]').extract()[0])
            if link_text is not None:
                item['proName'] = link_text.group(1)
            else:
                item['proName'] = product_item.xpath(
                    'td[1]/text()').extract()[0]

            item['incomeRate'] = cell_text(product_item, 7)
            item['currency'] = cell_text(product_item, 4)
            item['startDate'] = cell_text(product_item, 2)
            item['endDate'] = cell_text(product_item, 3)
            # Reference net yield for large-amount customers (stored in
            # nextIncomeRate at the customer's request).
            item['nextIncomeRate'] = cell_text(product_item, 8)

            # Follow the hyperlink when there is one.
            hrefs = product_item.xpath('td[1]/a/@href').extract()
            if hrefs:
                next_page_url = "{}{}".format(self.base_url, hrefs[0])
                yield scrapy.Request(next_page_url,
                                     meta={'item': item},
                                     callback=self.parse_product_detail,
                                     dont_filter=True)
            else:
                yield item
示例#10
0
    def parse_product_detail(self, response):
        """Build a CMBC web product item from the ``returnData`` JSON object.

        Every mapped value is copied from the payload with surrounding
        whitespace stripped.

        Raises:
            KeyError: if ``returnData`` or one of the mapped keys is missing.
            AttributeError: if a mapped value has no ``strip`` method.
        """
        content = self.__get_response_content(response)
        item = BankproductItem()

        item['bankCode'] = 'cmbc'
        item['channel'] = 'web'

        product = json.loads(content)['returnData']
        # Item field -> key in the payload (note the mixed-case
        # 'Next_Income_Rate', used exactly as the original code did).
        field_map = (
            ('proCode', 'PRD_CODE'),
            ('proName', 'PRD_NAME'),
            ('proAttr', 'PRD_ATTR_NAME'),
            ('proType', 'PRD_TYPE_NAME'),
            ('sellObject', 'SELLDIR'),
            ('status', 'STATUS_NAME'),
            ('currency', 'CURR_TYPE_NAME'),
            ('crFlag', 'CRFLAGNAME'),
            ('startDate', 'IPO_START_DATE'),
            ('endDate', 'IPO_END_DATE'),
            ('openDate', 'START_DATE'),
            ('nextOpenDate', 'PRD_NEXT_DATE'),
            ('nextEndDate', 'EDDATE'),
            ('realEndDate', 'REALEND_DATE'),
            ('cycleTime', 'LIV_TIME_UNIT_NAME'),
            ('incomeRate', 'INCOME_RATE'),
            ('nextIncomeRate', 'Next_Income_Rate'),
            ('interestType', 'INTEREST_TYPE_NAME'),
            ('riskLevel', 'RISK_LEVEL_NAME'),
            ('openTime', 'OPEN_TIME'),
            ('closeTime', 'CLOSE_TIME'),
            ('firstSubMinAmount', 'PFIRST_AMT'),
            ('minRedBalance', 'PRED_UNIT'),
            ('minSubUnit', 'PSUB_UNIT'),
            ('maxSingleSub', 'PMAX_AMT'),
            ('maxSingleRed', 'PMAX_RED'),
            ('maxOneDaySub', 'PDAY_MAX'),
            ('plainHold', 'PMIN_HOLD'),
            ('proNetValue', 'NAV'),
        )
        for field, source_key in field_map:
            item[field] = product[source_key].strip()
        yield item
示例#11
0
 def parse_product_detail(self, response):
     """Build and return a CZB web product item from the detail page.

     The heading text is used as the product name; the product code is
     whatever follows '型' in it, or the whole heading when '型' is absent.
     The minimum amount is the text before '起购'. The remaining values are
     read from the ``num_det`` block.

     Raises:
         AttributeError: if the minimum-amount text contains no '起购'.
     """
     def captioned(block_class, caption):
         # Value paragraph of the num_det sub-block with the given class
         # whose caption paragraph contains `caption`.
         path = ("div[contains(@class,'num_det')]/div[@class='{}' and "
                 "contains(p[@class='num_txt'],'{}')]/p[1]/text()").format(
                     block_class, caption)
         return self.__get_xpath_value(response, path).strip()

     record = BankproductItem()
     record['bankCode'] = 'czb'
     record['channel'] = 'web'

     heading = self.__get_xpath_value(
         response, "div[contains(@class,'nameLC')]/h3/text()").strip()
     code_match = re.search('型(.*)', heading)
     if code_match:
         record['proCode'] = code_match.group(1)
     else:
         record['proCode'] = heading
     record['proName'] = heading

     amount_text = self.__get_xpath_value(
         response, "div[contains(@class,'nameLC')]/p/text()").strip()
     record['firstAmount'] = re.search('(.*)起购', amount_text).group(1)

     # The rate caption is stored as found, without stripping.
     record['incomeRateName'] = self.__get_xpath_value(
         response,
         "div[contains(@class,'num_det')]/div[@class='fl_num']"
         "/p[@class='num_txt']/text()")
     record['incomeRate'] = self.__get_xpath_value(
         response,
         "div[contains(@class,'num_det')]/div[@class='fl_num']/p[1]/text()"
     ).strip()

     record['currency'] = captioned('mid_date', '币种')
     record['cycleTime'] = captioned('mid_date', '理财期限')
     record['endDate'] = captioned('fr_date', '认购截止日')
     record['openTime'] = captioned('fr_date', '申购时间')
     return record
示例#12
0
 def parse_retail_zyb(self, response):
     """Yield CIB retail products from the page's table.

     Rows whose cells contain '产品名称' are skipped as headers. The raise
     period in the 5th cell is split on '-' into a start date (with year)
     and an end part. The end part gets the start date's year, or the
     following year when its month/day falls before the start's month/day,
     i.e. when the period crosses New Year. The product code is the part of
     the image file name between ``lccp`` and ``.png`` (empty when the row
     has no image).

     Raises:
         ValueError: if the start date is not in ``%Y年%m月%d日`` form.
     """
     table = response.xpath(
         "//*[@id='content']//div[@class='middle']/table")
     proAttr = self.__get_xpath_value(
         response,
         "//*[@id='content']//div[@class='middle']/h1/text()").strip()
     # The header label is the same for every row, so read it once.
     income_rate_name = self.__get_xpath_value(
         table, "tbody/tr[1]/td[6]/text()").strip()

     for product_item in table.xpath("tbody/tr[not(contains(td,'产品名称'))]"):
         item = BankproductItem()
         item['bankCode'] = 'cib'
         item['channel'] = 'web'
         item['proAttr'] = proAttr
         item['proName'] = self.__get_xpath_value(product_item,
                                                  "td[1]/text()").strip()
         item['sellArea'] = self.__get_xpath_value(product_item,
                                                   "td[2]/text()").strip()
         item['proType'] = self.__get_xpath_value(product_item,
                                                  "td[3]/text()").strip()
         item['firstAmount'] = self.__get_xpath_value(
             product_item, "td[4]/text()").strip()

         # Raise period: "<start with year>-<end without year>".
         raise_date = self.__get_xpath_value(product_item,
                                             "td[5]/text()").strip()
         item['startDate'] = self.__get_re_value(raise_date, "(.*)-(.*)",
                                                 1).strip()
         end_text = self.__get_re_value(raise_date, "(.*)-(.*)", 2).strip()
         start = datetime.strptime(item['startDate'], '%Y年%m月%d日')

         end_year = start.year
         end_parts = re.match(r'(\d+)月(\d+)日', end_text)
         if end_parts:
             end_month_day = (int(end_parts.group(1)),
                              int(end_parts.group(2)))
             # An end before the start within the same year means the
             # period runs into the next year.
             if end_month_day < (start.month, start.day):
                 end_year += 1
         item['endDate'] = str(end_year) + "年" + end_text

         item['incomeRateName'] = income_rate_name
         item['incomeRate'] = self.__get_xpath_value(
             product_item, "td[6]/text()").strip()
         product_pic = product_item.xpath("td[9]/img/@src")
         # The dot is escaped so it only matches the literal ".png" suffix.
         item['proCode'] = re.search(
             r'lccp(.*?)\.png',
             product_pic.extract()[0]).group(1) if product_pic else ''
         yield item
示例#13
0
    def parse_product_detail(self, response):
        """Build an HFB web product item from the detail page and yield it.

        Most fields are read from fixed row/column positions of the ``con2``
        table; the income rate and its label come from the table inside the
        ``con1 of`` block. When download links for the product manual
        ('说明书') or the risk disclosure ('风险') are present, their hrefs
        are appended to the directory part of ``self.start_url``.

        Raises:
            AttributeError: if the minimum subscription unit text does not
                match the amount pattern.
        """
        def table_cell(row, column):
            # Stripped text of one cell of the 'con2' table.
            return self.__get_xpath_value(
                response,
                "//table[@class='con2']/tbody/tr[{}]/td[{}]/text()".format(
                    row, column)).strip()

        def download_url(href):
            # Prefix href with start_url up to and including its last '/'.
            # The previous find()-based lookup returned a wrong or empty
            # base when the last path segment was empty or also occurred
            # earlier in the URL.
            return '{}{}'.format(self.start_url.rsplit('/', 1)[0] + '/', href)

        item = BankproductItem()

        item['bankCode'] = 'hfb'
        item['channel'] = 'web'
        item['proCode'] = table_cell(1, 2)
        item['proName'] = table_cell(1, 4)
        item['openDate'] = table_cell(2, 2)
        item['realEndDate'] = table_cell(2, 4)
        item['currency'] = table_cell(3, 2)
        item['riskLevel'] = table_cell(3, 4)
        item['cycleTime'] = table_cell(5, 2)
        item['endDate'] = table_cell(5, 4)
        # Drop all whitespace from the sales channel text.
        item['sellChannel'] = re.sub(r'[\r\n\t\s]', '', table_cell(6, 2))
        item['incomeRateName'] = self.__get_xpath_value(
            response,
            "//div[@class='con1 of']/table/tbody/tr[1]/td[1]/text()").strip()
        item['incomeRate'] = self.__get_xpath_value(
            response,
            "//div[@class='con1 of']/table/tbody/tr[1]/td[2]/p/text()"
        ).strip()
        item['firstAmount'] = table_cell(4, 2)
        # NOTE(review): the '.' in this pattern is unescaped and therefore
        # matches any character between the digit runs; kept as is because
        # the real format of the amount text is not visible here.
        item['minSubUnit'] = re.search(r'((\d)+.(\d)+)',
                                       table_cell(4, 4)).group(0)

        # Download path of the product manual.
        instructionUrl = self.__get_xpath_value(
            response, "//a[@class='download' and contains(@href,'说明书')]/@href")
        if instructionUrl:
            item['instructionUrl'] = download_url(instructionUrl)
        # Download path of the risk disclosure.
        riskDisclosureUrl = self.__get_xpath_value(
            response, "//a[@class='download' and contains(@href,'风险')]/@href")
        if riskDisclosureUrl:
            item['riskDisclosureUrl'] = download_url(riskDisclosureUrl)
        yield item
示例#14
0
    def parse(self, response):
        """Parse one page of the CGB product list and schedule follow-ups.

        Every ``tr.bg2`` row of ``#product_tab`` is one product: an item is
        pre-filled and a detail-page request is yielded (callback
        ``parse_product_detail``, item in ``meta['item']``). When the page
        contains at least one product link, the next page is requested by
        advancing ``currPage`` by one and ``turnPageBeginPos`` by
        ``rowsPerpage``, all read back from the current request body.
        """
        # The header label is the same for every row, so read it once.
        income_rate_name = self.__get_xpath_value(
            response, "//*[@id='product_tab']/tr[1]/th[5]/text()").strip()

        # One row per product.
        for product_item in response.xpath(
                "//*[@id='product_tab']//tr[@class='bg2']"):
            item = BankproductItem()
            item['bankCode'] = 'cgb'
            item['channel'] = 'web'

            item['proName'] = self.__get_xpath_value(
                product_item, "td[@class='name']/a/text()").strip()
            item_url = self.__get_xpath_value(
                product_item, "td[@class='name']/a/@href").strip()
            item['proCode'] = self.__get_re_value(item_url, 'productno=(.*)',
                                                  1)
            item['currency'] = self.__get_xpath_value(product_item,
                                                      "td[2]/text()").strip()
            item['cycleTime'] = self.__get_xpath_value(
                product_item, "td[3]/text()").strip().replace('&nbsp', '')
            item['firstAmount'] = self.__get_xpath_value(
                product_item, "td[4]/text()").strip()
            item['incomeRateName'] = income_rate_name
            item['incomeRate'] = self.__get_xpath_value(
                product_item, "td[5]/b/text()").strip()
            item['riskLevel'] = self.__get_xpath_value(product_item,
                                                       "td[6]/text()").strip()

            recruitment_period = self.__get_xpath_value(
                product_item, "td[7]/text()").strip()
            # This literal marks a product without a dated period.
            if recruitment_period != '长&nbsp&nbsp&nbsp期':
                item['startDate'] = self.__get_re_value(
                    recruitment_period, "(.*?)至", 1)
                # Greedy group: a lazy "(.*?)" at the very end of a pattern
                # always captures the empty string, which left endDate blank.
                item['endDate'] = self.__get_re_value(recruitment_period,
                                                      "至(.*)", 1)

            product_item_url = "{}{}".format(self.base_url, item_url)
            yield scrapy.Request(product_item_url,
                                 meta={'item': item},
                                 callback=self.parse_product_detail,
                                 dont_filter=True)

        # Is there a next page of data?
        exist_data = response.xpath(
            "//*[@id='product_tab']//tr[@class='bg2']/td[@class='name']/a/@href"
        )
        if exist_data:
            request_body = str(response.request.body, encoding='utf-8')
            # Current page.
            curr_page = int(
                re.search(r'currPage=(\d+)', request_body).group(1))
            # Rows per page.
            page_size = int(
                re.search(r'rowsPerpage=(\d+)', request_body).group(1))
            # Offset of the first row of the current page.
            begin_pos = int(
                re.search(r'turnPageBeginPos=(\d+)', request_body).group(1))
            self.form_data['currPage'] = str(curr_page + 1)
            self.form_data['turnPageBeginPos'] = str(begin_pos + page_size)
            yield scrapy.FormRequest(self.start_url,
                                     method="POST",
                                     formdata=self.form_data,
                                     dont_filter=True)
示例#15
0
 def parse_retail_open(self, response):
     """Yield one ``BankproductItem`` per product row of the CIB retail page.

     Every ``<p>`` under ``#content .middle`` heads a product category and
     is followed by the table listing that category's products.  Tables
     whose first row has 10 cells carry ``startDate``/``endDate`` columns;
     all other tables carry a single ``openTime`` column instead, which
     shifts every later column one position to the left.

     :param response: the response of the product listing page.
     :return: generator of populated ``BankproductItem`` objects.
     """
     selector = Selector(response)
     # Walk the product categories.
     for product_type in selector.xpath(
             "//*[@id='content']//div[@class='middle']/p"):
         # The category's table is the first <table> sibling after the <p>.
         table_selector = product_type.xpath("following-sibling::table[1]")
         title_num = len(table_selector.xpath("tbody/tr[1]").css("td"))
         proAttr = self.__get_xpath_value(product_type,
                                          "strong/text()").strip()
         # Keep only the text before '产品'.  partition() returns the whole
         # heading when the marker is absent, whereas slicing with
         # find() == -1 would silently drop the last character.
         pro_attr = proAttr.partition('产品')[0]

         # Plain-text columns, in table order (td[1], td[2], ...).
         if title_num == 10:
             date_columns = ('startDate', 'endDate')
         else:
             date_columns = ('openTime',)
         text_columns = ('proName',) + date_columns + (
             'sellArea', 'currency', 'cycleTime', 'proType', 'firstAmount')
         # The income rate and the product picture follow the text columns.
         rate_col = len(text_columns) + 1
         pic_col = rate_col + 1

         # Visit every row except the header row.
         for product_item in table_selector.xpath(
                 "tbody/tr[not(contains(td,'产品名称'))]"):
             item = BankproductItem()
             item['bankCode'] = 'cib'
             item['channel'] = 'web'
             item['proAttr'] = pro_attr
             for position, field in enumerate(text_columns, start=1):
                 item[field] = self.__get_xpath_value(
                     product_item, "td[%d]/text()" % position).strip()
             item['incomeRateName'] = '业绩比较基准'
             # Several XPath alternatives can be combined with "|": the rate
             # is either wrapped in <strong> or is plain cell text.
             item['incomeRate'] = self.__get_xpath_value(
                 product_item,
                 "td[{0}]/strong/text()|td[{0}]/text()".format(
                     rate_col)).strip()

             # The product code is embedded in the picture file name
             # (lccp<code>.png).  A missing picture or a non-matching file
             # name yields an empty code instead of aborting the generator.
             pro_code = ''
             product_pic = product_item.xpath(
                 'td[%d]/img/@src' % pic_col)
             if product_pic:
                 matched = re.search(r'lccp(.*?)\.png',
                                     product_pic.extract()[0])
                 if matched:
                     pro_code = matched.group(1)
             item['proCode'] = pro_code
             yield item
示例#16
0
    def parse_product_detail(self, response):
        """Build a CMB web ``BankproductItem`` from a product detail page.

        Values are read either from ``<li>label<span>value</span></li>``
        entries or from the ``<td>`` that follows a label ``<td>``.  The
        per-product document URLs are derived from the product code.  The
        item is not yielded here: it is handed to
        ``parse_product_detail_2`` through the request ``meta`` of a
        follow-up request for the instruction page.

        :param response: the response of the product detail page.
        :return: generator yielding the follow-up ``scrapy.Request``.
        """

        def li_value(label):
            # Stripped text of the <span> inside the <li> containing label.
            return self.__get_xpath_value(
                response,
                "//li[contains(text(),'{}')]/span/text()".format(
                    label)).strip()

        def td_value(label):
            # Stripped text of the <td> right after the <td> containing label.
            return self.__get_xpath_value(
                response,
                "//td[contains(text(), '{}')]/following-sibling::td[1]/text()"
                .format(label)).strip()

        item = BankproductItem()

        item['bankCode'] = 'cmb'
        item['channel'] = 'web'
        proCode = li_value('产品代码')
        item['proCode'] = proCode

        item['proName'] = td_value('产品简称')
        item['proAttr'] = li_value('产品类别')
        item['proType'] = li_value('投资类型')
        item['currency'] = td_value('币种')
        item['riskLevel'] = li_value('风险评级')
        item['startDate'] = li_value('发售起始日期')
        item['endDate'] = li_value('发售截止日期')
        item['realEndDate'] = li_value('产品到期日')
        item['sellChannel'] = li_value('销售渠道')

        # Amount / limit fields, all taken from label-td -> value-td pairs.
        # minSinglePur and minSingleRed previously read the *upper* limit
        # cells (申购单笔上限 / 赎回单笔上限), duplicating maxSinglePur and
        # maxSingleRed; they now read the lower-limit cells, matching
        # minSingleSub.
        td_fields = (
            ('firstAmount', '认购价格'),
            ('firstSubMinAmount', '首次认购下限'),
            ('minPurBalance', '最低申购余额'),
            ('minRedBalance', '最低赎回余额'),
            ('minSubUnit', '认购基数'),
            ('minPurUnit', '申购基数'),
            ('minRedUnit', '赎回基数'),
            ('maxSingleSub', '认购单笔上限'),
            ('maxSinglePur', '申购单笔上限'),
            ('maxSingleRed', '赎回单笔上限'),
            ('minSingleSub', '认购单笔下限'),
            ('minSinglePur', '申购单笔下限'),
            ('minSingleRed', '赎回单笔下限'),
        )
        for field, label in td_fields:
            item[field] = td_value(label)

        # Not set in this callback: sellObject, status, crFlag, cycleTime,
        # incomeRateName, incomeRate, nextIncomeRate, interestType, redRule,
        # buyRule, openDate, nextOpenDate, nextEndDate, openTime, closeTime,
        # proManager, sellArea, currentPurchases, maxOneDaySub, plainHold,
        # proNetValue, allowedResRed, allowedRelRed and the *DownloadUrl
        # fields.

        # Every document page shares one URL and differs only in "type".
        detail_url = (
            "http://www.cmbchina.com/cfweb/Personal/productdetail.aspx?code=")
        url_fields = (
            ('overviewUrl', 'prodintro'),
            ('infoUrl', 'prodinfo'),
            ('noticeUrl', 'prodnotice'),
            ('netWorthUrl', 'prodvalue'),
            ('reportUrl', 'prodir'),
            ('commentUrl', 'prodcomment'),
            ('instructionUrl', 'prodexplain'),
            ('riskDisclosureUrl', 'prodrisk'),
        )
        for field, page_type in url_fields:
            item[field] = '{}{}{}'.format(detail_url, proCode,
                                          "&type=" + page_type)

        yield scrapy.Request(item['instructionUrl'],
                             meta={'item': item},
                             callback=self.parse_product_detail_2,
                             dont_filter=True)
示例#17
0
    def parse_product_detail(self, response):
        """Build and yield a CITIC web ``BankproductItem`` from a detail page.

        Most values sit in the second ``<span>`` next to a label ``<span>``.
        Depending on the field the value is the span text, a coded
        attribute that is translated to a label, or a quoted literal inside
        an inline ``<script>``.  Coded fields (currency, risk level, status,
        manager) are only set when the code is one of the known values.

        :param response: the response of the product detail page.
        :return: generator yielding the populated item.
        """
        item = BankproductItem()

        item['bankCode'] = 'citic'
        item['channel'] = 'web'
        item['proCode'] = self.__get_xpath_value(response, "//td[text()='产品代码']/../td[2]/text()").strip()
        item['proName'] = self.__get_re_value(response.text, "<div class=\"title_l\">(.*?)<span>", 1).strip()
        cycleTime = self.__get_xpath_value(response, "//td[text()='产品期限']/../td[2]/script/text()").strip()
        # NOTE(review): the unit '天' is appended even when no number was
        # extracted - confirm whether a bare '天' is acceptable downstream.
        item['cycleTime'] = self.__get_re_value(cycleTime, "'(.*?)'", 1) + '天'
        item['firstAmount'] = self.__get_xpath_value(response, "//td[text()='购买起点']/../td[2]/@startpoint").strip()

        # Attributes holding a code that is translated to a display label:
        # (item field, span label, attribute name, code -> label).
        coded_fields = (
            # Currency
            ('currency', '币种', 'curr_type', {
                '001': "人民币",
                '014': "美元",
            }),
            # Risk level
            ('riskLevel', '风险等级', 'risklevel', {
                '0': '无风险',
                '1': '低风险',
                '2': '较低风险',
                '3': '中等风险',
                '4': '较高风险',
                '5': '高风险',
            }),
            # Product status
            ('status', '产品状态', 'prod_state', {
                '0': '开放期',
                '1': '募集期',
                '3': '发行失败',
                '4': '停止交易',
            }),
            # Managing institution
            ('proManager', '管理机构', 'prdmanager', {
                '008': '中信银行',
            }),
        )
        for field, label, attribute, labels_by_code in coded_fields:
            code = self.__get_xpath_value(
                response,
                "//span[text()='{}']/../span[2]/@{}".format(label, attribute))
            # Unknown codes leave the field unset.
            if code in labels_by_code:
                item[field] = labels_by_code[code]

        item['openDate'] = self.__get_xpath_value(response, "//span[text()='起息日']/../span[2]/text()").strip()
        item['realEndDate'] = self.__get_xpath_value(response, "//span[text()='到期日']/../span[2]/text()").strip()
        item['nextOpenDate'] = self.__get_xpath_value(response, "//span[text()='下一开放日']/../span[2]/text()").strip()

        # Target customers: each digit in the script literal is replaced by
        # the corresponding customer-group name.
        sellObject = self.__get_xpath_value(response, "//span[text()='产品面向客户群']/../span[2]/script/text()")
        sellObject = self.__get_re_value(sellObject, '"(.*?)"', 1)
        item['sellObject'] = sellObject.replace('0', "个人普通客户 ").replace('1', "个人金卡客户 ") \
            .replace('2', "个人白金客户 ").replace('4', "个人钻石客户")

        # Sales area
        item['sellArea'] = self.__get_xpath_value(response, "//span[text()='销售区域']/../span[2]/text()").strip()
        item['incomeRate'] = self.__get_xpath_value(response, "//div[@class='col-lg-4 col-md-4 col-sm-4  lc_text_m']/div/span/@finagains")

        proNetValue = self.__get_xpath_value(response, "//span[text()='产品净值']/../span[2]/script/text()")
        item['proNetValue'] = self.__get_re_value(proNetValue, '"(.*?)"', 1)

        # Amount limits are quoted literals inside an inline script; an empty
        # literal is reported as '0.00'.  The literal is extracted once per
        # field rather than once for the test and again for the value.
        limit_fields = (
            ('maxSingleRed', '赎回单笔上限'),
            ('minSingleRed', '赎回单笔下限'),
            ('maxSingleSub', '认购单笔上限'),
            ('minSingleSub', '认购单笔下限'),
            ('maxSinglePur', '申购单笔上限'),
            ('minSinglePur', '申购单笔下限'),
            ('minRedUnit', '赎回基数'),
            ('minSubUnit', '认购基数'),
            ('minPurUnit', '申购基数'),
        )
        for field, label in limit_fields:
            script_text = self.__get_xpath_value(
                response,
                "//span[text()='{}']/../span[2]/script/text()".format(label))
            amount = self.__get_re_value(script_text, '"(.*?)"', 1)
            item[field] = '0.00' if amount == '' else amount

        # Yes/no flags are stored as '1' / '0' in the dataisbit attribute.
        allowedResRed = self.__get_xpath_value(response, "//span[text()='是否允许预约赎回']/../span[2]/@dataisbit")
        item['allowedResRed'] = allowedResRed.replace('0', "否").replace('1', "是")

        allowedRelRed = self.__get_xpath_value(response, "//span[text()='是否允许实时赎回']/../span[2]/@dataisbit")
        item['allowedRelRed'] = allowedRelRed.replace('0', "否").replace('1', "是")

        item['instructionUrl'] = self.__get_xpath_value(response, "//div[@class='title_r']/ul/li[1]/a/@href")

        yield item