Python insert_value 예제들, deya_research_spider.tools.insert_value Python 예제들

예제 #1

0

파일 보기

    def parse(self, response, **kwargs):
        """Store the lowest four-digit price found in the first table group.

        The ``rowspan`` of the first cell in the second table row gives the
        number of rows that are scanned. Within each row, every text node made
        of exactly four digits is treated as a price and the last one wins.
        The minimum over all rows is passed to ``insert_value`` together with
        ``response.meta['date']`` and ``self.index_id``.

        Nothing is stored (a warning is logged instead) when the rowspan
        attribute is missing or when no row yields a price.
        """
        import logging

        log = logging.getLogger(__name__)
        date = response.meta.get('date')
        rowspan = response.xpath(
            "//div[@id='content']//tbody/tr[2]/td[1]/@rowspan").get()
        if rowspan is None:
            # Without the rowspan the size of the group is unknown.
            log.warning("parse: no rowspan found for date %s; skipping", date)
            return
        row = int(rowspan)
        tr_list = response.xpath(
            f"//div[@id='content']//tbody/tr[position()>1][position()<{row + 1}]"
        )
        prices = []
        price = None
        for tr in tr_list:
            for text in tr.xpath('.//text()').extract():
                match = re.match(r"^(\d\d\d\d)$", text)
                if match:
                    price = match.group(1)
            # A row without a price of its own (e.g. a "-" placeholder) keeps
            # the previously seen price; repeating it cannot change the
            # minimum. Rows before the first price contribute nothing.
            if price is not None:
                prices.append(int(price))
        if not prices:
            log.warning("parse: no price found for date %s; skipping", date)
            return
        insert_value(date, min(prices), self.index_id)

예제 #2

0

파일 보기

파일: spider_18.py 프로젝트: pippichi/scrapy

 def parse(self, response, **kwargs):
     """Handle the first page of a JSON price-history response.

     In mode 1 every entry of ``content['pageInfo']['list']`` is stored via
     ``insert_value`` and one ``FormRequest`` is yielded for each remaining
     page (2..``content['pages']``), handled by ``self.parse_next_page``.
     In any other mode only the first entry is stored, and only when its
     date equals today's date.

     Any exception is caught and printed, matching the error handling of the
     original code.
     """
     try:
         content = json.loads(response.text)
         obj_id = response.meta['obj_id']
         if self.mode == 1:
             formdata = response.meta['formdata']
             pages = content['pages']
             for c in content['pageInfo']['list']:
                 date = c['indexDate'].replace("/", "-")
                 value = c['indexValue']
                 insert_value(date, value, obj_id)
             for p in range(2, int(pages)+1):
                 # Build a fresh dict per page: mutating one shared dict
                 # would make every request's meta['formdata'] end up
                 # pointing at the last page number.
                 page_formdata = dict(formdata, pageNumber=str(p))
                 yield FormRequest(
                     url='https://dc.oilchem.net/price_search/history.htm',
                     formdata=page_formdata,
                     callback=self.parse_next_page,
                     dont_filter=True,
                     headers=self.get_headers2(host=self.host_dc_oilchem),
                     meta={'obj_id': obj_id, 'formdata': page_formdata}
                 )
                 # NOTE(review): time.sleep blocks the whole crawler while
                 # waiting; a download-delay setting is presumably the
                 # better throttle - kept to preserve current pacing.
                 time.sleep(1 + int(random.uniform(5, 10)))
         else:
             date = content['pageInfo']['list'][0]['indexDate'].replace("/", "-")
             if date != datetime.date.today().strftime("%Y-%m-%d"):
                 print("今日" + self.id_name[obj_id] + "还没出来")
             else:
                 value = content['pageInfo']['list'][0]['indexValue']
                 insert_value(date, value, obj_id)
     except Exception as e:
         print('!' * 30)
         print(e)
         print('!' * 30)

예제 #3

0

파일 보기

    def parse(self, response, **kwargs):
        """Sum capacity * rate over the table rows and store the result.

        Each non-header row is re-parsed on its own. Rows with as many cells
        as the header read capacity/rate from cells 2 and 3; other rows
        (presumably continuation rows of a merged cell - confirm) read them
        from cells 1 and 2. Rows where either value is missing are skipped.
        The total is divided by 52, rounded to an integer and passed to
        ``insert_value`` with ``response.meta['date']`` and
        ``self.index_id``.
        """
        date = response.meta.get('date')
        title = response.xpath(
            "//div[@id='content']//tr[@class='firstRow']/td//span/text()"
        ).getall()
        rows = response.xpath(
            "//div[@id='content']//tr[not(contains(@class, 'firstRow'))]"
        ).getall()
        value = Decimal(0)
        for row in rows:
            data = parsel.Selector(row).xpath("//td").extract()
            if len(data) == len(title):
                index1, index2 = 2, 3
            else:
                index1, index2 = 1, 2
            capacity = parsel.Selector(
                data[index1]).xpath("//span/text()").extract_first()
            rate = parsel.Selector(
                data[index2]).xpath("//span/text()").extract_first()
            if capacity is None or rate is None:
                continue
            # Strip the percent sign explicitly instead of blindly dropping
            # the last character, which would eat a digit whenever the cell
            # has no trailing '%'.
            rate_num = Decimal(rate.strip().rstrip('%％')) / 100
            value += Decimal(capacity) * rate_num
        value = (value / 52).quantize(Decimal('0'))

        insert_value(date, value, self.index_id)

예제 #4

0

파일 보기

    def parse(self, response, **kwargs):
        """Store the number that follows "约" in the first content paragraph.

        The value is converted to ``Decimal`` and handed to ``insert_value``
        with ``response.meta['date']`` and ``self.index_id``.
        """
        date = response.meta.get('date')
        paragraph_query = "//div[@id='content']//p/text()"
        first_text = response.xpath(paragraph_query).extract_first()
        found = re.search(r"(?<=约)\d+(\.\d+)?", first_text)
        insert_value(date, Decimal(found.group()), self.index_id)

예제 #5

0

파일 보기

파일: spider_8_9.py 프로젝트: pippichi/scrapy

 def parse(self, response, **kwargs):
     """Store an operating-rate value and a computed capacity value.

     The rate is the number following "开工率" in the first content
     paragraph. The capacity value is the sum of capacity * rate over the
     non-header table rows, divided by 52 and rounded to an integer. Both
     are passed to ``insert_value`` with ``response.meta['date']``.
     """
     date = response.meta.get('date')

     # Operating-rate indicator: matched from the first paragraph.
     first_text = response.xpath(
         "//div[@id='content']//p/text()").extract_first()
     rate_match = re.search(r"(?<=开工率)\d+(\.\d+)?", first_text)
     rate_value = Decimal(rate_match.group())

     # Domestic-output indicator: computed from the table.
     header_cells = response.xpath(
         "//div[@id='content']//tr[@class='firstRow']/td//span/text()"
     ).getall()
     body_rows = response.xpath(
         "//div[@id='content']//tr[not(contains(@class, 'firstRow'))]"
     ).getall()

     total = Decimal(0)
     for row_html in body_rows:
         cells = parsel.Selector(row_html).xpath("//td").extract()
         # Full-width rows carry the figures one cell further right.
         first = 2 if len(cells) == len(header_cells) else 1
         capacity_text = parsel.Selector(
             cells[first]).xpath("//span/text()").extract_first()
         rate_text = parsel.Selector(
             cells[first + 1]).xpath("//span/text()").extract_first()
         if capacity_text is not None and rate_text is not None:
             capacity_num = Decimal(capacity_text)
             rate_num = Decimal(rate_text.replace('%', '')) / 100
             total += capacity_num * rate_num

     capacity_value = (total / 52).quantize(Decimal('0'))
     insert_value(date, rate_value, self.rate_index_id)
     insert_value(date, capacity_value, self.capacity_index_id)

예제 #6

0

파일 보기

파일: spider_1_2.py 프로젝트: pippichi/scrapy

 def parse(self, response, **kwargs):
     """Store the lower bounds of the last two "<low>-<high>元" price ranges.

     All ``<digits>-<digits>元`` ranges in the response text are collected.
     The lower bound of the second-to-last range is stored under
     ``self.sd_index_id`` and that of the last range under
     ``self.hd_index_id``, both with ``response.meta['date']``.

     Nothing is stored (a warning is logged instead) when fewer than two
     ranges are present.
     """
     import logging

     date = response.meta.get('date')
     # Raw string: '\d' in a normal literal is an invalid escape sequence.
     ranges = re.findall(r'(\d+)-(\d+)元', str(response.text))
     if len(ranges) < 2:
         logging.getLogger(__name__).warning(
             "parse: expected two price ranges for date %s, found %d",
             date, len(ranges))
         return
     sd_price = Decimal(ranges[-2][0])
     hd_price = Decimal(ranges[-1][0])
     insert_value(date, sd_price, self.sd_index_id)
     insert_value(date, hd_price, self.hd_index_id)

예제 #7

0

파일 보기

파일: spider_19.py 프로젝트: pippichi/scrapy

 def parse(self, response, **kwargs):
     """Store the lowest price among selected row groups of the table.

     The table body is split into consecutive groups whose sizes come from
     ``rowspan`` attributes (defaulting to 1 when absent). Prices are read
     from the first three groups and from the rows after the fourth group
     up to the overall row count; the fourth group is skipped. Each price
     comes from the second cell, falling back to another cell when the
     second cell is not purely digits. The numerically smallest price is
     passed to ``insert_value`` with ``response.meta['date']`` and
     ``self.index_id``.

     Raises:
         ValueError: if no numeric price is found.
     """
     date = response.meta.get('date')
     tr = response.css('#content > table > tbody > tr')
     row_count = int(
         response.xpath(
             '//*[@id="content"]/table/tbody/tr[2]/td[1]/@rowspan').get())

     def rowspan_of(xpath_row, cell):
         # Rowspan of the given cell, or 1 when the attribute is absent.
         found = response.xpath(
             '//*[@id="content"]/table/tbody/tr[{}]/td[{}]/@rowspan'.format(
                 xpath_row, cell)).get()
         return 1 if found is None else int(found)

     def price_text(row_sel, fallback_col):
         # Second cell if it is all digits, otherwise the fallback cell.
         text = row_sel.css('td:nth-child(2) > p > span::text').get()
         if text is None or re.fullmatch(r'\d*', text) is None:
             text = row_sel.css(
                 'td:nth-child({}) > p > span::text'.format(
                     fallback_col)).get()
         return text

     # Slice offsets into ``tr`` (0-based; row 0 is the header). XPath row
     # numbers are 1-based, hence the "+ 1" in each lookup.
     start_1 = 1
     start_2 = start_1 + rowspan_of(start_1 + 1, 2)
     start_3 = start_2 + rowspan_of(start_2 + 1, 1)
     start_4 = start_3 + rowspan_of(start_3 + 1, 1)
     start_5 = start_4 + rowspan_of(start_4 + 1, 1)

     val = []
     # Maoming Petrochemical
     for row_sel in tr[start_1:start_2]:
         val.append(price_text(row_sel, 4))
     # Guangzhou Petrochemical
     for row_sel in tr[start_2:start_3]:
         val.append(price_text(row_sel, 3))
     # Fujian United: each row is read through its own selector (the
     # original reused the previous loop's variable), skipping 'HPPSS'.
     for row_sel in tr[start_3:start_4]:
         if row_sel.css('td:first-child > p > span::text').get() == 'HPPSS':
             continue
         val.append(price_text(row_sel, 3))
     # Skip the Hainan refinery group, then read the remaining rows.
     for row_sel in tr[start_5:row_count + 1]:
         val.append(price_text(row_sel, 3))

     # Compare numerically: min() over strings is lexicographic and would
     # rank "10000" below "9000". Non-numeric or missing cells are ignored.
     numeric = []
     for text in val:
         try:
             numeric.append(Decimal(text))
         except (ArithmeticError, TypeError):
             continue
     value = min(numeric)
     insert_value(date, value, self.index_id)

예제 #8

0

파일 보기

파일: spider_19.py 프로젝트: pippichi/scrapy

 def parse(self, response, **kwargs):
     """Store the lowest fourth-column price among the listed factories.

     A row is considered when the first two characters of its first-cell
     text are in ``self.factory_list``. The numerically smallest value of
     the fourth cell across those rows is passed to ``insert_value`` with
     ``response.meta['date']`` and ``self.index_id``.

     Raises:
         ValueError: if no numeric price is found.
     """
     date = response.meta.get('date')
     numeric = []
     for row_sel in response.css('#content > table > tbody > tr'):
         factory = row_sel.css('td:first-child > p > span::text').get()
         if factory is None or factory[0:2] not in self.factory_list:
             continue
         text = row_sel.css('td:nth-child(4) > p > span::text').get()
         # Compare as numbers: min() over strings is lexicographic and
         # would rank "10000" below "9000". Non-numeric or missing cells
         # are ignored.
         try:
             numeric.append(Decimal(text))
         except (ArithmeticError, TypeError):
             continue
     value = min(numeric)
     insert_value(date, value, self.index_id)

예제 #9

0

파일 보기

파일: spider_3.py 프로젝트: pippichi/scrapy

 def parse(self, response, **kwargs):
     """Store the value of the table's closing cell.

     Three selectors are tried in order and the first one that yields text
     is used; the text is converted to ``Decimal`` and passed to
     ``insert_value`` with ``response.meta['date']`` and ``self.index_id``.
     """
     date = response.meta.get('date')
     candidates = (
         # Current DOM structure.
         'div#content tbody > tr:last-child td:last-child::text',
         # DOM structure before 2019-06-14.
         'div#content tbody > tr:nth-child(11) > td:nth-child(4) > strong::text',
         # DOM structure before 2018-10-26.
         'div#content tbody > tr:last-child > td:last-child > p::text',
     )
     val = None
     for query in candidates:
         val = response.css(query).get()
         if val is not None:
             break
     insert_value(date, Decimal(val), self.index_id)

예제 #10

0

파일 보기

파일: spider_18.py 프로젝트: pippichi/scrapy

 def parse_next_page(self, response):
     """Store every entry of a follow-up history page (mode 1 only).

     The JSON body and ``response.meta['obj_id']`` are always read; entries
     of ``content['pageInfo']['list']`` are stored via ``insert_value`` only
     when ``self.mode == 1``. Any exception is caught and printed.
     """
     try:
         content = json.loads(response.text)
         obj_id = response.meta['obj_id']
         if self.mode != 1:
             return
         for entry in content['pageInfo']['list']:
             day = entry['indexDate'].replace("/", "-")
             insert_value(day, entry['indexValue'], obj_id)
     except Exception as e:
         banner = '!' * 30
         print(banner)
         print(e)
         print(banner)

예제 #11

0

파일 보기

    def parse(self, response, **kwargs):
        """Store the lowest price of the first table group.

        The ``rowspan`` of the first cell in the second table row gives the
        number of rows that are scanned. The price sits in cell 5 of the
        first row and cell 4 of the following rows; for dates on or before
        2019-09-09 it sits one cell further left. When the first text node
        is a number below 1000, the digits are assumed to be split across
        several tags and all text nodes of the cell are joined. Cells that
        are empty or not numeric (e.g. "-") are skipped.

        The minimum is passed to ``insert_value`` with
        ``response.meta['date']`` and ``self.index_id``. Nothing is stored
        (a warning is logged instead) when the rowspan is missing or no row
        yields a price.
        """
        import logging

        log = logging.getLogger(__name__)
        date = response.meta.get('date')
        rowspan = response.xpath(
            "//div[@id='content']//tbody/tr[2]/td[1]/@rowspan").get()
        if rowspan is None:
            log.warning("parse: no rowspan found for date %s; skipping", date)
            return
        row = int(rowspan)
        tr_list = response.xpath(
            f"//div[@id='content']//tbody/tr[position()>1][position()<{row+1}]"
        )
        # Choose the column layout once, so the current-layout columns are
        # never parsed for old pages.
        # NOTE(review): assumes date is a 'YYYY-MM-DD' string - confirm.
        shift = 1 if date <= "2019-09-09" else 0
        prices = []
        for position, tr in enumerate(tr_list, start=1):
            column = (5 if position == 1 else 4) - shift
            texts = tr.xpath(f'./td[{column}]//text()').extract()
            if not texts:
                continue
            price = texts[0]
            try:
                # Special case seen from page 26 on: one number is split
                # across several tags, so join all text nodes.
                if int(price) < 1000:
                    price = ''.join(texts)
                prices.append(int(price))
            except ValueError:
                # Missing values are shown as "-"; skip them.
                continue
        if not prices:
            log.warning("parse: no price found for date %s; skipping", date)
            return
        insert_value(date, min(prices), self.index_id)

예제 #12

0

파일 보기

    def parse(self, response, **kwargs):
        """Store a value computed from the first "辛醇" table row.

        Each non-header row is re-parsed and its cell texts are read as
        (product, capacity, rate). For the first row whose product is
        "辛醇", the leading numbers of capacity and rate are combined as
        ``capacity * rate / 10 / 52``, rounded to an integer and passed to
        ``insert_value`` with ``response.meta['date']`` and
        ``self.index_id``. Rows that do not have exactly three cell texts
        are skipped. Nothing is stored when no matching row exists.
        """
        date = response.meta.get('date')
        rows = response.xpath(
            "//div[@id='content']//tr[not(contains(@class, 'firstRow'))]"
        ).getall()
        # Raw string: '\-' and '\d' are invalid escapes in a normal literal.
        leading_number = re.compile(r'^(\-|\+)?\d+(\.\d+)?')
        for row in rows:
            cells = parsel.Selector(row).xpath("//td/text()").extract()
            if len(cells) != 3:
                # Unpacking would raise on rows with empty or extra cells.
                continue
            production, capacity, rate = cells
            if production != '辛醇':
                continue
            capacity_num = Decimal(leading_number.search(capacity).group())
            rate_num = Decimal(leading_number.search(rate).group())
            value = ((capacity_num * rate_num / 10) / 52).quantize(
                Decimal('0'))

            insert_value(date, value, self.index_id)
            break

예제 #13

0

파일 보기

 def parse(self, response, **kwargs):
     """Store the lowest price of the first table group.

     The group size is the ``rowspan`` of the first cell in the second
     table row. The price is the first text node of cell 5 in the first
     row and of cell 4 in every later row. The minimum is passed to
     ``insert_value`` with ``response.meta['date']`` and ``self.index_id``.
     """
     date = response.meta.get('date')
     row = int(response.xpath(
         "//div[@id='content']//tbody/tr[2]/td[1]/@rowspan").get())
     group_rows = response.xpath(
         f"//div[@id='content']//tbody/tr[position()>1][position()<{row+1}]"
     )
     found = []
     for position, row_sel in enumerate(group_rows, start=1):
         # The first row has one extra leading cell.
         query = './td[5]//text()' if position == 1 else './td[4]//text()'
         found.append(int(row_sel.xpath(query).extract_first()))
     insert_value(date, min(found), self.index_id)

예제 #14

0

파일 보기

파일: spider_14.py 프로젝트: pippichi/scrapy

    def parse(self, response, **kwargs):
        """Extract five inventory figures from fixed paragraphs and store them.

        The paragraphs are addressed by position inside ``#content`` (children
        24, 25, 31, 48 and 54), so the method depends on the page keeping that
        exact layout. Each figure is stored with ``insert_value`` under its own
        index id together with ``response.meta['date']``.

        A missing paragraph or a non-matching pattern raises (``TypeError`` /
        ``AttributeError``) because the results are used unchecked.
        """
        date = response.meta.get('date')
        draw_p = response.css('#content > p:nth-child(24) > span::text').get()
        fiber_p = response.css('#content > p:nth-child(25) > span::text').get()
        merchant_p = response.css(
            '#content > p:nth-child(31) > span::text').get()
        # Spaces are removed from this paragraph before matching.
        plastic_woven_p = response.css(
            '#content > p:nth-child(48) > span::text').get().replace(" ", "")
        bopp_p = response.css('#content > p:nth-child(54) > span::text').get()
        # Absolute levels: the number right after "企业库存在".
        draw_value = Decimal(
            re.search(r"(?<=企业库存在)\d+(\.\d+)?", draw_p).group())
        fiber_value = Decimal(
            re.search(r"(?<=企业库存在)\d+(\.\d+)?", fiber_p).group())

        # Week-over-week changes: the number is kept positive when it follows
        # the explicit "increase" wording; otherwise the number following any
        # two CJK characters is taken and negated.
        merchant_re = re.search(r"(?<=贸易商库存较上周增加)\d+(\.\d+)?", merchant_p)
        if merchant_re:
            merchant_value = Decimal(merchant_re.group())
        else:
            merchant_value = -Decimal(
                re.search(r"(?<=贸易商库存较上周[\u4e00-\u9fa5]{2})\d+(\.\d+)?",
                          merchant_p).group())

        plastic_woven_re = re.search(r"(?<=原料库存天数较上周上涨)\d+(\.\d+)?",
                                     plastic_woven_p)
        if plastic_woven_re:
            plastic_woven_value = Decimal(plastic_woven_re.group())
        else:
            plastic_woven_value = -Decimal(
                re.search(r"(?<=原料库存天数较上周[\u4e00-\u9fa5]{2})\d+(\.\d+)?",
                          plastic_woven_p).group())

        bopp_re = re.search(r"(?<=BOPP原料库存较上周上涨)\d+(\.\d+)?", bopp_p)
        if bopp_re:
            bopp_value = Decimal(bopp_re.group())
        else:
            bopp_value = -Decimal(
                re.search(r"(?<=BOPP原料库存较上周[\u4e00-\u9fa5]{2})\d+(\.\d+)?",
                          bopp_p).group())
        insert_value(date, draw_value, self.draw_index_id)
        insert_value(date, fiber_value, self.fiber_index_id)
        insert_value(date, merchant_value, self.merchant_index_id)
        insert_value(date, plastic_woven_value, self.plastic_woven_index_id)
        insert_value(date, bopp_value, self.bopp_index_id)

예제 #15

0

파일 보기

파일: spider_19.py 프로젝트: pippichi/scrapy

 def parse(self, response, **kwargs):
     """Store the lowest price of the first group, minus the "中天" rows.

     ``row_count`` is the ``rowspan`` of the first cell in the second table
     row. Within those rows, the rowspan of the last company whose name
     starts with "中天" (1 if none is found or it has no rowspan) is
     subtracted from the end of the range. For each remaining row the price
     is the third cell, or the fourth when the third is not purely digits.
     The numerically smallest price is passed to ``insert_value`` with
     ``response.meta['date']`` and ``self.index_id``.

     Raises:
         ValueError: if no numeric price is found.
     """
     date = response.meta.get('date')
     row_count = int(
         response.xpath(
             '//*[@id="content"]/table/tbody/tr[2]/td[1]/@rowspan').get())
     count = 1
     tr = response.css('#content > table > tbody > tr')
     for row_sel in tr[1:row_count + 1]:
         company = row_sel.xpath('td[1]/p/span/text()').get()
         # Continuation rows of a merged cell may have no company text.
         if company is not None and company[0:2] == '中天':
             tmp = row_sel.xpath('td[1]/@rowspan').get()
             if tmp is not None:
                 count = int(tmp)
     numeric = []
     for row_sel in tr[1:row_count - count + 1]:
         temp = row_sel.css('td:nth-child(3) > p > span::text').get()
         if temp is None or re.fullmatch(r'\d*', temp) is None:
             temp = row_sel.css('td:nth-child(4) > p > span::text').get()
         # Compare as numbers: min() over strings is lexicographic and
         # would rank "10000" below "9000". Non-numeric or missing cells
         # are ignored.
         try:
             numeric.append(Decimal(temp))
         except (ArithmeticError, TypeError):
             continue
     value = min(numeric)
     insert_value(date, value, self.index_id)

예제 #16

0

파일 보기

    def parse(self, response, **kwargs):
        """Rebuild the capacity table as a spreadsheet and store three totals.

        Pages whose header row lacks a "用途" column are ignored. Otherwise
        the table is written to ``<date>.xlsx`` (a relative path), with the
        leading cells of shorter rows filled from the previous rows, read back
        with pandas, and three values are stored via ``insert_value``: total
        capacity, the capacity of rows whose "用途" is "--" divided by 365,
        and total capacity / 365 minus that figure.

        Any exception is caught and printed with a 'step2' marker.
        """
        try:
            title = response.xpath(
                '//*[@id="content"]/table/tbody/tr[@class="firstRow"]/td//text()'
            ).getall()
            if not '用途' in title:
                pass
            else:
                # Observed: the header row contains either "产能" or
                # "产能（万吨/年）"; flag records which of the two is present.
                flag = 0 if '产能' in title else 1
                # The date comes from an absolute path into the page layout.
                date = response.xpath(
                    "/html/body/div[8]/div[2]/div[1]/div[1]/div[1]/span/text()"
                ).get()
                # parse_date is defined elsewhere; presumably it normalises the
                # page's date text into a string (it is concatenated with
                # '.xlsx' below) - confirm.
                date = parse_date(date)
                temp = response.xpath(
                    "//*[@id='content']/table/tbody/tr[not(contains(@class, 'firstRow'))]"
                )
                # content: one list of cell strings per body row, each cell
                # being the concatenation of all its text nodes.
                content = []
                for t in temp:
                    td = t.xpath('.//td')
                    temp_list = []
                    for t_td in td:
                        text = t_td.xpath(".//text()").getall()
                        temp_str = "".join(text)
                        temp_list.append(temp_str)
                    content.append(temp_list)
                wb = Workbook()
                wb.remove(wb['Sheet'])
                st = wb.create_sheet(self.sheet_name)
                row = 2
                # Last values seen in the first three columns, used to fill
                # rows that have fewer cells than the header.
                last_name_col1, last_name_col2, last_name_col3 = None, None, None

                # Header goes into spreadsheet row 1.
                for ti in range(len(title)):
                    st.cell(1, ti + 1).value = title[ti]

                for c in content:
                    if len(c) == len(title):
                        # Full row: remember its first three cells.
                        last_name_col1 = c[0]
                        last_name_col2 = c[1]
                        last_name_col3 = c[2]
                        for col in range(len(c)):
                            st.cell(row, col + 1).value = c[col]
                    elif len(c) == len(title) - 1 and len(title) - 1 > 0:
                        # One cell short: column 1 is carried over.
                        st.cell(row, 1).value = last_name_col1
                        last_name_col2 = c[0]
                        last_name_col3 = c[1]
                        for col in range(len(c)):
                            st.cell(row, col + 2).value = c[col]
                    elif len(c) == len(title) - 2 and len(title) - 2 > 0:
                        # Two cells short: columns 1 and 2 are carried over.
                        st.cell(row, 1).value = last_name_col1
                        st.cell(row, 2).value = last_name_col2
                        last_name_col3 = c[0]
                        for col in range(len(c)):
                            st.cell(row, col + 3).value = c[col]
                    else:
                        # Any other length: columns 1-3 are carried over.
                        st.cell(row, 1).value = last_name_col1
                        st.cell(row, 2).value = last_name_col2
                        st.cell(row, 3).value = last_name_col3
                        for col in range(len(c)):
                            st.cell(row, col + 4).value = c[col]
                    row += 1
                wb.save(date + '.xlsx')

                # Read the sheet back so pandas handles the column lookup and
                # numeric conversion.
                ex_pd = pd.read_excel(date + '.xlsx',
                                      sheet_name=self.sheet_name)
                if flag == 0:
                    ex_pd_c = ex_pd['产能']
                    ex_pd_c_filter = ex_pd[ex_pd['用途'] == '--']['产能']
                else:
                    ex_pd_c = ex_pd['产能（万吨/年）']
                    ex_pd_c_filter = ex_pd[ex_pd['用途'] == '--']['产能（万吨/年）']
                # All results are rounded to four decimal places.
                capacity_of_production = Decimal(sum(ex_pd_c)).quantize(
                    Decimal("0.0000"))
                repair = Decimal(sum(ex_pd_c_filter) / 365).quantize(
                    Decimal("0.0000"))
                capacity_sub_repair = Decimal(capacity_of_production / 365 -
                                              repair).quantize(
                                                  Decimal("0.0000"))

                insert_value(date, capacity_of_production,
                             self.capacity_of_production_id)
                insert_value(date, capacity_sub_repair,
                             self.capacity_sub_repair_id)
                insert_value(date, repair, self.repair_id)
        except Exception as e:
            print('!' * 30)
            print('step2')
            print(e)
            print('!' * 30)

예제 #17

0

파일 보기

파일: spider_13.py 프로젝트: pippichi/scrapy

 def parse(self, response, **kwargs):
     """Store the number that follows "两油库存" in the first content text.

     The value is converted to ``Decimal`` and passed to ``insert_value``
     with ``response.meta['date']`` and ``self.index_id``.
     """
     date = response.meta.get('date')
     first_text = response.css('#content > p::text').get()
     found = re.search(r"(?<=两油库存)\d+(\.\d+)?", first_text)
     insert_value(date, Decimal(found.group()), self.index_id)

예제 #18

0

파일 보기

    def parse(self, response, **kwargs):
        """Store import and export figures read from the second table row.

        Cells 2-4 hold the import figures and cells 6-8 the export figures.
        For each direction three values are stored: the first cell alone, the
        sum of the second and third cells, and the sum of all three. Each is
        passed to ``insert_value`` with ``response.meta['date']`` and its own
        index id.
        """
        date = response.meta.get('date')

        def cell_texts(columns):
            # First text node of each requested cell in the second row.
            return [
                response.xpath(
                    "//div[@id='content']//tr[2]/td[{}]/text()".format(
                        column)).extract_first()
                for column in columns
            ]

        # Import figures.
        imp_a, imp_b, imp_c = [Decimal(text) for text in cell_texts((2, 3, 4))]
        import_junju = imp_a
        import_gongju = imp_b + imp_c
        import_total = imp_a + imp_b + imp_c

        # Export figures.
        exp_a, exp_b, exp_c = [Decimal(text) for text in cell_texts((6, 7, 8))]
        export_junju = exp_a
        export_gongju = exp_b + exp_c
        export_total = exp_a + exp_b + exp_c

        # Store all six values.
        insert_value(date, import_junju, self.import_junju_index_id)
        insert_value(date, import_gongju, self.import_gongju_index_id)
        insert_value(date, import_total, self.import_total_index_id)
        insert_value(date, export_junju, self.export_junju_index_id)
        insert_value(date, export_gongju, self.export_gongju_index_id)
        insert_value(date, export_total, self.export_total_index_id)
예제 #19

0

파일 보기

파일: spider_3.py 프로젝트: pippichi/scrapy

 def parse(self, response, **kwargs):
     """Store the text of the last cell in the table's last row.

     The text is converted to ``Decimal`` and passed to ``insert_value``
     with ``response.meta['date']`` and ``self.index_id``.
     """
     date = response.meta.get('date')
     last_cell = response.css(
         'div#content tbody > tr:last-child td:last-child::text').get()
     insert_value(date, Decimal(last_cell), self.index_id)