def parse(self, response, **kwargs):
    """Store the lowest four-digit value found in the leading table rows.

    The ``rowspan`` of the first cell of the second table row gives the
    number of rows to scan, starting at the second row.  Within each scanned
    row, the last text node made of exactly four digits is that row's price.
    Rows without such a node (for example a "-" placeholder) contribute
    nothing.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        TypeError: If the ``rowspan`` attribute is missing.
        ValueError: If no scanned row contains a four-digit value.
    """
    date = response.meta.get('date')
    row = int(
        response.xpath(
            "//div[@id='content']//tbody/tr[2]/td[1]/@rowspan").get())
    tr_list = response.xpath(
        f"//div[@id='content']//tbody/tr[position()>1][position()<{row + 1}]"
    )

    four_digits = re.compile(r"^(\d\d\d\d)$")
    prices = []
    for tr in tr_list:
        # Reset per row so a row without a price never reuses the value
        # found in an earlier row.
        row_price = None
        for text in tr.xpath('.//text()').extract():
            found = four_digits.search(text)
            if found:
                row_price = found.group(1)
        if row_price is not None:
            prices.append(int(row_price))

    insert_value(date, min(prices), self.index_id)
def parse(self, response, **kwargs):
    """Handle the first page of a price-history JSON response.

    When ``self.mode == 1`` every entry of the page is stored and one
    ``FormRequest`` is yielded for each remaining page (2 .. ``pages``); those
    pages are handled by ``parse_next_page``.  In any other mode only the
    first entry is looked at, and it is stored only if it is dated today.

    Any exception raised while processing is caught and printed.

    Args:
        response: JSON response.  ``response.meta`` must hold ``obj_id`` and,
            in mode 1, the ``formdata`` dict used for the original request.
        **kwargs: Unused.

    Yields:
        FormRequest: One request per follow-up page (mode 1 only).
    """
    try:
        content = json.loads(response.text)
        obj_id = response.meta['obj_id']
        if self.mode == 1:
            formdata = response.meta['formdata']
            pages = content['pages']
            for entry in content['pageInfo']['list']:
                date = entry['indexDate'].replace("/", "-")
                value = entry['indexValue']
                insert_value(date, value, obj_id)
            for page in range(2, int(pages) + 1):
                # Give every request its own form dict: mutating one shared
                # dict would make the ``meta['formdata']`` of all pending
                # requests point at the last page number.
                page_form = dict(formdata, pageNumber=str(page))
                yield FormRequest(
                    url='https://dc.oilchem.net/price_search/history.htm',
                    formdata=page_form,
                    callback=self.parse_next_page,
                    dont_filter=True,
                    headers=self.get_headers2(host=self.host_dc_oilchem),
                    meta={'obj_id': obj_id, 'formdata': page_form}
                )
                # NOTE(review): this is a blocking sleep of 6-10 seconds
                # between yielded requests; it stalls whatever thread drives
                # this generator.  Kept as-is because the pacing looks
                # deliberate - confirm before replacing it.
                time.sleep(1 + int(random.uniform(5, 10)))
        else:
            date = content['pageInfo']['list'][0]['indexDate'].replace(
                "/", "-")
            if date != datetime.date.today().strftime("%Y-%m-%d"):
                print("今日" + self.id_name[obj_id] + "还没出来")
            else:
                value = content['pageInfo']['list'][0]['indexValue']
                insert_value(date, value, obj_id)
    except Exception as e:
        print('!' * 30)
        print(e)
        print('!' * 30)
def parse(self, response, **kwargs):
    """Accumulate ``capacity * rate`` over the table rows and store it / 52.

    For every non-header row the two cells named ``capacity`` and ``rate`` in
    this code are read (rows as wide as the header use cells 2 and 3, all
    other rows use cells 1 and 2).  The rate text loses its last character
    (presumably a ``%`` sign - confirm) and is divided by 100.  The sum of
    the products is divided by 52, rounded to an integer and stored.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.
    """
    date = response.meta.get('date')
    header_cells = response.xpath(
        "//div[@id='content']//tr[@class='firstRow']/td//span/text()"
    ).getall()
    body_rows = response.xpath(
        "//div[@id='content']//tr[not(contains(@class, 'firstRow'))]"
    ).getall()

    header_width = len(header_cells)
    total = Decimal(0)
    for row_html in body_rows:
        cells = parsel.Selector(row_html).xpath("//td").extract()
        # A row with as many cells as the header has one extra leading cell,
        # which shifts both wanted columns right by one.
        if len(cells) == header_width:
            capacity_at, rate_at = 2, 3
        else:
            capacity_at, rate_at = 1, 2
        capacity = parsel.Selector(
            cells[capacity_at]).xpath("//span/text()").extract_first()
        rate = parsel.Selector(
            cells[rate_at]).xpath("//span/text()").extract_first()
        if capacity is not None and rate is not None:
            total += Decimal(capacity) * (Decimal(rate[:-1]) / 100)

    insert_value(date, (total / 52).quantize(Decimal('0')), self.index_id)
def parse(self, response, **kwargs):
    """Store the number that directly follows '约' in the first paragraph text.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        AttributeError: If the pattern does not match.
        TypeError: If no paragraph text node exists.
    """
    date = response.meta.get('date')
    first_text = response.xpath(
        "//div[@id='content']//p/text()").extract_first()
    hit = re.search(r"(?<=约)\d+(\.\d+)?", first_text)
    insert_value(date, Decimal(hit.group()), self.index_id)
def parse(self, response, **kwargs):
    """Store two figures: the rate quoted in the text and a table-derived sum.

    * ``rate_value`` is the number directly following '开工率' in the first
      paragraph text node.
    * ``capacity_value`` is the sum over all non-header rows of
      ``capacity * rate / 100`` (with any ``%`` removed from the rate text),
      divided by 52 and rounded to an integer.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.
    """
    date = response.meta.get('date')

    # Rate taken from the prose.
    first_text = response.xpath(
        "//div[@id='content']//p/text()").extract_first()
    rate_value = Decimal(
        re.search(r"(?<=开工率)\d+(\.\d+)?", first_text).group())

    # Figure derived from the table.
    header_cells = response.xpath(
        "//div[@id='content']//tr[@class='firstRow']/td//span/text()"
    ).getall()
    body_rows = response.xpath(
        "//div[@id='content']//tr[not(contains(@class, 'firstRow'))]"
    ).getall()

    header_width = len(header_cells)
    weighted = Decimal(0)
    for row_html in body_rows:
        cells = parsel.Selector(row_html).xpath("//td").extract()
        # Rows as wide as the header carry one extra leading cell.
        if len(cells) == header_width:
            capacity_at, rate_at = 2, 3
        else:
            capacity_at, rate_at = 1, 2
        capacity = parsel.Selector(
            cells[capacity_at]).xpath("//span/text()").extract_first()
        rate = parsel.Selector(
            cells[rate_at]).xpath("//span/text()").extract_first()
        if capacity is not None and rate is not None:
            weighted += Decimal(capacity) * (
                Decimal(rate.replace('%', '')) / 100)
    capacity_value = (weighted / 52).quantize(Decimal('0'))

    insert_value(date, rate_value, self.rate_index_id)
    insert_value(date, capacity_value, self.capacity_index_id)
def parse(self, response, **kwargs):
    """Store the lower bounds of the last two "<low>-<high>元" ranges.

    The whole response text is searched for ranges written as
    ``<digits>-<digits>元``.  The lower bound of the second-to-last range is
    stored under ``self.sd_index_id`` and that of the last range under
    ``self.hd_index_id``.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        IndexError: If fewer than two ranges are found.
    """
    date = response.meta.get('date')
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    ranges = re.findall(r'(\d+)-(\d+)元', str(response.text))
    sd_price = Decimal(ranges[-2][0])
    hd_price = Decimal(ranges[-1][0])
    insert_value(date, sd_price, self.sd_index_id)
    insert_value(date, hd_price, self.hd_index_id)
def parse(self, response, **kwargs):
    """Store the lowest quote among selected producer groups of the table.

    The table body is split into consecutive row groups whose sizes come from
    ``rowspan`` attributes (a missing attribute means one row).  Quotes are
    collected from the first three groups and from whatever follows the
    fourth group, which is skipped.  In the third group, rows whose first
    cell reads ``HPPSS`` are ignored.  The numerically smallest quote is
    stored.

    Fixes over the previous version:
      * the third and fourth loops read from the stale loop variable of the
        second loop, so their rows were never actually inspected;
      * the minimum was taken over strings (lexicographic order) rather than
        over numbers;
      * a row with a missing cell raised ``TypeError`` in ``re.fullmatch``.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        TypeError: If the overall ``rowspan`` on the second row is missing.
        ValueError: If no numeric quote is found.
    """
    date = response.meta.get('date')
    tr = response.css('#content > table > tbody > tr')

    def rowspan_at(tr_pos, td_pos):
        """Return the rowspan of a 1-based row/cell, or 1 when absent."""
        raw = response.xpath(
            '//*[@id="content"]/table/tbody/tr[{}]/td[{}]/@rowspan'.format(
                tr_pos, td_pos)).get()
        return 1 if raw is None else int(raw)

    def quote_text(row, fallback_col):
        """Return the 2nd cell's text if all digits, else the fallback cell."""
        text = row.css('td:nth-child(2) > p > span::text').get()
        if text is None or re.fullmatch(r'\d*', text) is None:
            text = row.css(
                'td:nth-child({}) > p > span::text'.format(
                    fallback_col)).get()
        return text

    row_count = int(
        response.xpath(
            '//*[@id="content"]/table/tbody/tr[2]/td[1]/@rowspan').get())
    count_1 = rowspan_at(2, 2)
    count_2 = rowspan_at(count_1 + 1 + 1, 1)
    count_3 = rowspan_at(count_1 + count_2 + 1 + 1, 1)
    count_4 = rowspan_at(count_1 + count_2 + count_3 + 1 + 1, 1)

    val = []
    # Maoming Petrochemical
    for row in tr[1:count_1 + 1]:
        val.append(quote_text(row, 4))
    # Guangzhou Petrochemical
    for row in tr[count_1 + 1:count_1 + count_2 + 1]:
        val.append(quote_text(row, 3))
    # Fujian United
    for row in tr[count_1 + count_2 + 1:count_1 + count_2 + count_3 + 1]:
        if row.css('td:first-child > p > span::text').get() == 'HPPSS':
            continue
        val.append(quote_text(row, 3))
    # The Hainan refinery group (count_4 rows) is skipped on purpose.
    for row in tr[count_1 + count_2 + count_3 + count_4 + 1:row_count + 1]:
        val.append(quote_text(row, 3))

    # Compare as numbers; cells that are missing or not numeric are ignored.
    quotes = []
    for text in val:
        try:
            quotes.append(Decimal(text))
        except (ArithmeticError, TypeError):
            continue
    insert_value(date, min(quotes), self.index_id)
def parse(self, response, **kwargs):
    """Store the lowest fourth-column value among rows of listed factories.

    A row is considered when the first two characters of its first cell are
    contained in ``self.factory_list``.  The values are compared as numbers;
    the previous version compared the raw strings, which orders "10200"
    before "9800".  Cells that are missing or not numeric are ignored.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        ValueError: If no matching row yields a numeric value.
    """
    date = response.meta.get('date')
    quotes = []
    for row in response.css('#content > table > tbody > tr'):
        factory = row.css('td:first-child > p > span::text').get()
        if factory is None or factory[0:2] not in self.factory_list:
            continue
        text = row.css('td:nth-child(4) > p > span::text').get()
        try:
            quotes.append(Decimal(text))
        except (ArithmeticError, TypeError):
            # Missing cell (None) or non-numeric placeholder.
            continue
    insert_value(date, min(quotes), self.index_id)
def parse(self, response, **kwargs):
    """Store the value of the table's summary cell, trying three page layouts.

    The selectors are tried in order and the first one that yields text wins.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        TypeError: If none of the selectors yields text.
    """
    date = response.meta.get('date')
    layouts = (
        # Current DOM layout.
        'div#content tbody > tr:last-child td:last-child::text',
        # DOM layout used before 2019-06-14.
        'div#content tbody > tr:nth-child(11) > td:nth-child(4) > strong::text',
        # DOM layout used before 2018-10-26.
        'div#content tbody > tr:last-child > td:last-child > p::text',
    )
    val = None
    for selector in layouts:
        val = response.css(selector).get()
        if val is not None:
            break
    insert_value(date, Decimal(val), self.index_id)
def parse_next_page(self, response):
    """Store every entry of a follow-up page of the JSON price history.

    Nothing is stored unless ``self.mode == 1``.  Any exception raised while
    processing is caught and printed.

    Args:
        response: JSON response; ``response.meta['obj_id']`` identifies the
            series the values are stored under.
    """
    try:
        payload = json.loads(response.text)
        obj_id = response.meta['obj_id']
        if self.mode != 1:
            return
        for entry in payload['pageInfo']['list']:
            insert_value(
                entry['indexDate'].replace("/", "-"),
                entry['indexValue'],
                obj_id,
            )
    except Exception as err:
        print('!' * 30)
        print(err)
        print('!' * 30)
def parse(self, response, **kwargs):
    """Store the lowest price found in the leading table rows.

    The ``rowspan`` of the first cell of the second table row gives the
    number of rows to scan.  The price sits in column 4 (column 5 in the
    first scanned row, which has one extra leading cell); for pages dated on
    or before 2019-09-09 it sits one column further left.

    Rows whose price is missing or not numeric (e.g. "-") are skipped.  The
    previous version converted the text with ``int()`` outside its ``try``
    block, so such rows raised instead of being skipped, and it parsed the
    new-layout column even for old-layout dates.

    Args:
        response: Page response; ``response.meta['date']`` must be an ISO
            ``YYYY-MM-DD`` string and is forwarded to ``insert_value``.
        **kwargs: Unused.

    Raises:
        TypeError: If the ``rowspan`` attribute or the date is missing.
        ValueError: If no scanned row yields a price.
    """
    date = response.meta.get('date')
    row = int(
        response.xpath(
            "//div[@id='content']//tbody/tr[2]/td[1]/@rowspan").get())
    tr_list = response.xpath(
        f"//div[@id='content']//tbody/tr[position()>1][position()<{row+1}]"
    )

    # The column layout changed around 2019-09-09: pages dated up to and
    # including that day use the older, left-shifted column.
    base_col = 3 if date <= "2019-09-09" else 4

    def read_price(tr, col):
        """Return the integer price in cell *col*, or None if unusable."""
        texts = tr.xpath(f'./td[{col}]//text()').extract()
        if not texts:
            return None
        try:
            price = int(texts[0])
            if price < 1000:
                # Special case seen on later pages: the digits are split
                # across several tags, so join all text nodes of the cell.
                price = int(''.join(texts))
        except ValueError:
            return None
        return price

    prices = []
    for position, tr in enumerate(tr_list):
        col = base_col + 1 if position == 0 else base_col
        price = read_price(tr, col)
        if price is not None:
            prices.append(price)

    insert_value(date, min(prices), self.index_id)
def parse(self, response, **kwargs):
    """Store a figure computed from the '辛醇' row of the table.

    Each non-header row must consist of exactly three cell text nodes, named
    ``production``, ``capacity`` and ``rate`` here.  For the first row whose
    ``production`` equals '辛醇', the leading numbers of the other two cells
    are combined as ``capacity * rate / 10 / 52``, rounded to an integer and
    stored.  Nothing is stored when no such row exists.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        ValueError: If a row before the match does not have exactly three
            cell text nodes.
        AttributeError: If a cell of the matching row has no leading number.
    """
    date = response.meta.get('date')
    rows = response.xpath(
        "//div[@id='content']//tr[not(contains(@class, 'firstRow'))]"
    ).getall()

    # Raw string: the escapes in this pattern are invalid in a normal string
    # literal and trigger a warning on current Python versions.
    leading_number = re.compile(r'^(\-|\+)?\d+(\.\d+)?')

    for row in rows:
        production, capacity, rate = parsel.Selector(row).xpath(
            "//td/text()").extract()
        if production != '辛醇':
            continue
        capacity_num = Decimal(leading_number.search(capacity).group())
        rate_num = Decimal(leading_number.search(rate).group())
        value = ((capacity_num * rate_num / 10) / 52).quantize(
            Decimal('0'))
        insert_value(date, value, self.index_id)
        break
def parse(self, response, **kwargs):
    """Store the lowest price found in the leading table rows.

    The ``rowspan`` of the first cell of the second table row gives the
    number of rows to scan.  The price is the first text node of column 5 in
    the first scanned row and of column 4 in all later rows.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        TypeError: If the ``rowspan`` attribute or a price cell is missing.
        ValueError: If a price is not an integer or no row was scanned.
    """
    date = response.meta.get('date')
    span = response.xpath(
        "//div[@id='content']//tbody/tr[2]/td[1]/@rowspan").get()
    row = int(span)
    tr_list = response.xpath(
        f"//div[@id='content']//tbody/tr[position()>1][position()<{row+1}]"
    )
    prices = [
        int(
            tr.xpath(
                './td[5]//text()' if position == 0 else './td[4]//text()'
            ).extract_first())
        for position, tr in enumerate(tr_list)
    ]
    insert_value(date, min(prices), self.index_id)
def parse(self, response, **kwargs):
    """Extract five inventory figures from fixed paragraphs and store them.

    The paragraphs are addressed by position (``p:nth-child(24)``, 25, 31, 48
    and 54 under ``#content``).  Two figures are absolute levels (the number
    following '企业库存在').  The other three are week-on-week changes: the
    number is stored as positive when the text uses the "increase" wording
    of the first pattern, and as negative when only the generic two-character
    wording of the second pattern matches.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        AttributeError: If paragraph 48 is missing or a pattern fails to
            match its paragraph.
        TypeError: If one of the other paragraphs is missing.
    """
    date = response.meta.get('date')

    def paragraph(position):
        """Return the first span text of the paragraph at *position*."""
        return response.css(
            '#content > p:nth-child({}) > span::text'.format(position)).get()

    def level(text):
        """Return the number following '企业库存在' in *text*."""
        return Decimal(re.search(r"(?<=企业库存在)\d+(\.\d+)?", text).group())

    def signed_change(rise_pattern, any_pattern, text):
        """Return +number for the rise wording, -number otherwise."""
        rise = re.search(rise_pattern, text)
        if rise:
            return Decimal(rise.group())
        return -Decimal(re.search(any_pattern, text).group())

    draw_p = paragraph(24)
    fiber_p = paragraph(25)
    merchant_p = paragraph(31)
    plastic_woven_p = paragraph(48).replace(" ", "")
    bopp_p = paragraph(54)

    draw_value = level(draw_p)
    fiber_value = level(fiber_p)
    merchant_value = signed_change(
        r"(?<=贸易商库存较上周增加)\d+(\.\d+)?",
        r"(?<=贸易商库存较上周[\u4e00-\u9fa5]{2})\d+(\.\d+)?",
        merchant_p)
    plastic_woven_value = signed_change(
        r"(?<=原料库存天数较上周上涨)\d+(\.\d+)?",
        r"(?<=原料库存天数较上周[\u4e00-\u9fa5]{2})\d+(\.\d+)?",
        plastic_woven_p)
    bopp_value = signed_change(
        r"(?<=BOPP原料库存较上周上涨)\d+(\.\d+)?",
        r"(?<=BOPP原料库存较上周[\u4e00-\u9fa5]{2})\d+(\.\d+)?",
        bopp_p)

    insert_value(date, draw_value, self.draw_index_id)
    insert_value(date, fiber_value, self.fiber_index_id)
    insert_value(date, merchant_value, self.merchant_index_id)
    insert_value(date, plastic_woven_value, self.plastic_woven_index_id)
    insert_value(date, bopp_value, self.bopp_index_id)
def parse(self, response, **kwargs):
    """Store the lowest quote of the leading rows, excluding the '中天' rows.

    The ``rowspan`` of the first cell of the second table row gives the
    number of rows in the section.  The row whose first cell starts with
    '中天' supplies (through its own ``rowspan``, default 1) the number of
    trailing rows that are left out.  Quotes are read from the third cell,
    falling back to the fourth when the third is not all digits.

    The quotes are compared as numbers; the previous version compared raw
    strings, which orders "10200" before "9800".  Cells that are missing or
    not numeric are ignored instead of raising.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        TypeError: If the section ``rowspan`` is missing.
        ValueError: If no numeric quote is found.
    """
    date = response.meta.get('date')
    row_count = int(
        response.xpath(
            '//*[@id="content"]/table/tbody/tr[2]/td[1]/@rowspan').get())
    tr = response.css('#content > table > tbody > tr')

    # NOTE(review): the slice below assumes the '中天' rows are the last rows
    # of the section, and drops one trailing row even when no '中天' row is
    # found - confirm against the page layout.
    count = 1
    for row in tr[1:row_count + 1]:
        company = row.xpath('td[1]/p/span/text()').get()
        if company and company[0:2] == '中天':
            span = row.xpath('td[1]/@rowspan').get()
            if span is not None:
                count = int(span)

    quotes = []
    for row in tr[1:row_count - count + 1]:
        text = row.css('td:nth-child(3) > p > span::text').get()
        if text is None or re.fullmatch(r'\d*', text) is None:
            text = row.css('td:nth-child(4) > p > span::text').get()
        try:
            quotes.append(Decimal(text))
        except (ArithmeticError, TypeError):
            # Missing cell (None) or non-numeric placeholder.
            continue

    insert_value(date, min(quotes), self.index_id)
def parse(self, response, **kwargs):
    """Rebuild the page's table as a flat sheet and store three capacity sums.

    Pages whose header row lacks a '用途' cell are ignored.  Otherwise the
    table body is copied into a new workbook sheet (``self.sheet_name``),
    filling in the leading cells that shorter rows omit with the last values
    seen in the first three columns.  The workbook is saved as
    ``<date>.xlsx``, read back with pandas, and three values are stored:

    * the sum of the capacity column,
    * that sum / 365 minus the third value,
    * the capacity sum of rows whose '用途' equals '--', divided by 365.

    Any exception is caught and printed.

    NOTE(review): the ``<date>.xlsx`` file is written to the working
    directory and is not removed in this method.

    Args:
        response: Page response containing the table.
        **kwargs: Unused.
    """
    try:
        title = response.xpath(
            '//*[@id="content"]/table/tbody/tr[@class="firstRow"]/td//text()'
        ).getall()
        if not '用途' in title:
            pass
        else:
            # By observation, the header row contains either "产能" or
            # "产能(万吨/年)"; flag records which of the two is present.
            flag = 0 if '产能' in title else 1
            date = response.xpath(
                "/html/body/div[8]/div[2]/div[1]/div[1]/div[1]/span/text()"
            ).get()
            date = parse_date(date)
            temp = response.xpath(
                "//*[@id='content']/table/tbody/tr[not(contains(@class, 'firstRow'))]"
            )
            # One list of cell strings per body row; the text nodes of a
            # cell are concatenated.
            content = []
            for t in temp:
                td = t.xpath('.//td')
                temp_list = []
                for t_td in td:
                    text = t_td.xpath(".//text()").getall()
                    temp_str = "".join(text)
                    temp_list.append(temp_str)
                content.append(temp_list)
            wb = Workbook()
            wb.remove(wb['Sheet'])
            st = wb.create_sheet(self.sheet_name)
            # Sheet row 1 holds the header, data starts at row 2.
            row = 2
            last_name_col1, last_name_col2, last_name_col3 = None, None, None
            for ti in range(len(title)):
                st.cell(1, ti + 1).value = title[ti]
            for c in content:
                if len(c) == len(title):
                    # Full-width row: remember its first three cells.
                    last_name_col1 = c[0]
                    last_name_col2 = c[1]
                    last_name_col3 = c[2]
                    for col in range(len(c)):
                        st.cell(row, col + 1).value = c[col]
                elif len(c) == len(title) - 1 and len(title) - 1 > 0:
                    # One cell short: column 1 is inherited.
                    st.cell(row, 1).value = last_name_col1
                    last_name_col2 = c[0]
                    last_name_col3 = c[1]
                    for col in range(len(c)):
                        st.cell(row, col + 2).value = c[col]
                elif len(c) == len(title) - 2 and len(title) - 2 > 0:
                    # Two cells short: columns 1 and 2 are inherited.
                    st.cell(row, 1).value = last_name_col1
                    st.cell(row, 2).value = last_name_col2
                    last_name_col3 = c[0]
                    for col in range(len(c)):
                        st.cell(row, col + 3).value = c[col]
                else:
                    # Any other width: columns 1-3 are inherited.
                    st.cell(row, 1).value = last_name_col1
                    st.cell(row, 2).value = last_name_col2
                    st.cell(row, 3).value = last_name_col3
                    for col in range(len(c)):
                        st.cell(row, col + 4).value = c[col]
                row += 1
            wb.save(date + '.xlsx')
            ex_pd = pd.read_excel(date + '.xlsx',
                                  sheet_name=self.sheet_name)
            if flag == 0:
                ex_pd_c = ex_pd['产能']
                ex_pd_c_filter = ex_pd[ex_pd['用途'] == '--']['产能']
            else:
                ex_pd_c = ex_pd['产能(万吨/年)']
                ex_pd_c_filter = ex_pd[ex_pd['用途'] == '--']['产能(万吨/年)']
            capacity_of_production = Decimal(sum(ex_pd_c)).quantize(
                Decimal("0.0000"))
            repair = Decimal(sum(ex_pd_c_filter) / 365).quantize(
                Decimal("0.0000"))
            capacity_sub_repair = Decimal(capacity_of_production / 365 -
                                          repair).quantize(
                                              Decimal("0.0000"))
            insert_value(date, capacity_of_production,
                         self.capacity_of_production_id)
            insert_value(date, capacity_sub_repair,
                         self.capacity_sub_repair_id)
            insert_value(date, repair, self.repair_id)
    except Exception as e:
        print('!' * 30)
        print('step2')
        print(e)
        print('!' * 30)
def parse(self, response, **kwargs):
    """Store the number that directly follows '两油库存' in the first paragraph.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        AttributeError: If the pattern does not match.
        TypeError: If no paragraph text node exists.
    """
    date = response.meta.get('date')
    first_text = response.css('#content > p::text').get()
    hit = re.search(r"(?<=两油库存)\d+(\.\d+)?", first_text)
    insert_value(date, Decimal(hit.group()), self.index_id)
def parse(self, response, **kwargs):
    """Store import and export figures read from the second table row.

    Cells 2-4 of the row hold the import numbers and cells 6-8 the export
    numbers.  For each direction three values are stored: the first cell on
    its own, the sum of the second and third cells, and the sum of all three.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        TypeError: If one of the cells has no text.
    """
    date = response.meta.get('date')

    def cell(position):
        """Return the first text node of the given cell of the second row."""
        return response.xpath(
            "//div[@id='content']//tr[2]/td[{}]/text()".format(
                position)).extract_first()

    # Import figures.
    imp_a, imp_b, imp_c = cell(2), cell(3), cell(4)
    import_junju = Decimal(imp_a)
    import_gongju = Decimal(imp_b) + Decimal(imp_c)
    import_total = Decimal(imp_a) + Decimal(imp_b) + Decimal(imp_c)

    # Export figures.
    exp_a, exp_b, exp_c = cell(6), cell(7), cell(8)
    export_junju = Decimal(exp_a)
    export_gongju = Decimal(exp_b) + Decimal(exp_c)
    export_total = Decimal(exp_a) + Decimal(exp_b) + Decimal(exp_c)

    # Persist all six values.
    insert_value(date, import_junju, self.import_junju_index_id)
    insert_value(date, import_gongju, self.import_gongju_index_id)
    insert_value(date, import_total, self.import_total_index_id)
    insert_value(date, export_junju, self.export_junju_index_id)
    insert_value(date, export_gongju, self.export_gongju_index_id)
    insert_value(date, export_total, self.export_total_index_id)
def parse(self, response, **kwargs):
    """Store the text of the last cell in the table's last row as a Decimal.

    Args:
        response: Page response; ``response.meta['date']`` is forwarded to
            ``insert_value``.
        **kwargs: Unused.

    Raises:
        TypeError: If the cell has no text.
    """
    date = response.meta.get('date')
    last_cell = response.css(
        'div#content tbody > tr:last-child td:last-child::text').get()
    insert_value(date, Decimal(last_cell), self.index_id)