def parse_content(self, response):
    """Parse a PP market-price page into MarketPricePP items.

    Reads the first table under #PanelContent, the page title from the
    news header, and the trading date from response.meta['data_date'].

    Yields:
        MarketPricePP: one item per data row of the table.
    Raises:
        Exception: when the expected table is missing or too narrow.
    """
    doc = lxml.html.document_fromstring(response.body_as_unicode())
    data_table2 = doc.xpath('//div[@id="PanelContent"]//table')[0]
    title = ''.join(
        response.xpath(
            '//div[@class="news_content "]/h1//text()').extract()).strip()
    logging.debug(title)
    # Only the date part (YYYY-MM-DD) of the meta value is used.
    data_date = datetime.datetime.strptime(
        response.meta['data_date'][0:10], '%Y-%m-%d')
    data_list = table_to_list(data_table2)
    if len(data_list) <= 1 or len(data_list[1]) < 5:
        # Fixed: was `raise '...'` — raising a string is a TypeError on
        # any modern Python; raise a real exception instead.
        raise Exception('PLAS.CHEM99----get table failed %s' % response.url)
    for row in data_list[1:]:
        item = MarketPricePP()
        item['materials'] = row[0].strip()
        item['product'] = row[1].strip()
        item['price'] = row[2].strip()
        item['rise_offset'] = row[3].strip()
        item['remarks'] = row[4].strip()
        item['datadate'] = data_date
        item['update_dt'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item['source'] = title
        yield item
def parse_content(self, response):
    """Parse a factory-price page into FactoryPrice items.

    The trading date is an 8-digit date embedded in the page title.

    Yields:
        FactoryPrice: one item per data row of the table.
    """
    # Fixed: removed unused `data_table` selector and commented-out
    # debug lines; only the lxml-parsed table is used.
    doc = lxml.html.document_fromstring(response.body_as_unicode())
    data_table2 = doc.xpath('//div[@id="PanelContent"]//table')[0]
    title = ''.join(
        response.xpath(
            '//div[@class="news_content "]/h1//text()').extract()).strip()
    logging.debug(title)
    data_date = re.compile(r'(\d{8})').search(title).group(1)
    data_list = table_to_list(data_table2)
    for row in data_list[1:]:
        item = FactoryPrice()
        item['region'] = row[0].strip()
        item['produce_code'] = row[1].strip()
        item['produce_name'] = row[2].strip()
        item['pre_price'] = row[3].strip()
        item['price'] = row[4].strip()
        item['rise_offset'] = row[5].strip()
        # The remarks column is optional on some pages.
        # Fixed: the assignment was duplicated inside this branch.
        if len(row) >= 7:
            item['remarks'] = row[6].strip()
        item['title'] = title
        item['trading_dt'] = data_date
        item['datetime_stamp'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        yield item
def parse_content(self, response):
    """Yield one PriceSXSY item per data row of the page's price table.

    The publication timestamp is pulled from the author/date line; the
    tab title comes from the second cell of the header row.
    """
    tree = lxml.html.document_fromstring(response.body_as_unicode())
    table_node = tree.xpath('//div[@id="PanelContent"]//table')[0]
    title = ''.join(
        response.xpath(
            '//div[@class="news_content "]/h1//text()').extract()).strip()
    auth_info = ''.join(
        response.xpath('//div[@class="news_title_b"]//text()').extract())
    pub_date = re.compile('(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})').search(
        auth_info).group(1)
    logging.debug(title)
    rows = table_to_list(table_node)
    tab_title = rows[0][1].strip()
    for row in rows[1:]:
        item = PriceSXSY()
        item['produce_code'] = row[0].strip()
        item['price'] = row[1].strip()
        item['rise_offset'] = row[2].strip()
        item['remarks'] = row[3].strip()
        item['title'] = title
        item['trading_dt'] = pub_date
        item['tab_title'] = tab_title
        item['datetime_stamp'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        yield item
def parse_content(self, response):
    """Parse bitumen production statistics into
    t_chem99_bithumen_prod_Item items.

    datatype 1 rows are tagged as per-region, datatype 2 as per-group.
    Rows are emitted only when the header row is wide enough to carry
    the month label (assumes the collapsed original guarded the loop
    this way — TODO confirm against the live page layout).
    """
    datadate = response.meta['datadate']
    datatype = response.meta['datatype']
    # Fixed: was a bare Python-2 `print`; use logging instead.
    logging.debug(datatype)
    data_table = response.xpath('//*[@id="Panel_News"]/div[1]/table')
    data_list = table_to_list(data_table)
    if len(data_list[0]) > 7:
        datemonth = data_list[0][1]
        for data in data_list[1:]:
            item = t_chem99_bithumen_prod_Item()
            item['datadate'] = datadate
            item['datemonth'] = datemonth
            if datatype == 1:
                item['cls_type'] = u'地区'
            elif datatype == 2:
                item['cls_type'] = u'集团'
            item['item_name'] = data[0]
            item['curr_month_value'] = data[1]
            item['pre_month_value'] = data[2]
            item['mom'] = data[3]
            item['pre_year_value'] = data[4]
            item['yoy'] = data[5]
            item['cumu_value_y'] = data[6]
            item['pre_cumu_value_y'] = data[7]
            item['cumu_yoy'] = data[8]
            item['update_dt'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            item['source'] = response.url
            yield item
def parse_content(self, response):
    """Parse a Thailand rubber price page into RubbThailand items.

    Walks the header columns of the table's last row: each header cell
    names a product whose price sits in the same column of that row.
    Falls back to response.meta['date'] when the row's own date cell
    does not parse.

    Raises:
        Exception: on a login redirect or a malformed table.
    """
    date = response.meta['date']
    title = response.meta['title']
    remark = ''.join(
        response.xpath(
            '//div[@id="Panel1"]/p[1]//text()').extract()).strip()
    logging.debug(title)
    doc = lxml.html.document_fromstring(response.body_as_unicode())
    # A login form in the body means the session is not authenticated.
    if doc.xpath('//form[@id="frm_login"]'):
        raise Exception('Login error')
    if doc.xpath('//div[@id="Panel1"]//table'):
        data_table = doc.xpath('//div[@id="Panel1"]//table')[0]
        data_list = table_to_list(data_table)
        if len(data_list) <= 1 or len(data_list[0]) < 5:
            raise Exception(
                'RUBB.CHEM99----get table failed %s' % response.url)
        # Only the most recent (last) row is of interest.
        row = data_list[-1]
        for index in range(1, len(data_list[0])):
            item = RubbThailand()
            item['product'] = data_list[0][index].strip()
            item['price'] = row[index].strip()
            item['remark'] = remark
            try:
                item['datadate'] = datetime.datetime.strptime(
                    row[0].strip(), '%Y/%m/%d')
            # Fixed: was a bare `except:` — narrow to strptime's
            # failure mode so real bugs are not silently swallowed.
            except ValueError:
                item['datadate'] = date
            item['update_dt'] = datetime.datetime.now()
            item['source'] = title
            yield item
def parse_content(self, response):
    """Parse a farm-film price matrix into PlasticFarmFilm items.

    The table is a product-by-area matrix; one item is yielded per
    (product row, area column) cell. The trading date is the 8-digit
    date embedded in the title.

    Raises:
        Exception: when the table is missing or too narrow.
    """
    # Fixed: removed unused `data_table` selector.
    doc = lxml.html.document_fromstring(response.body_as_unicode())
    data_table2 = doc.xpath('//div[@id="PanelContent"]//table')[0]
    title = ''.join(
        response.xpath(
            '//div[@class="news_content "]/h1//text()').extract()).strip()
    logging.debug(title)
    data_date = datetime.datetime.strptime(
        re.compile(r'(\d{8})').search(title).group(1), '%Y%m%d')
    data_list = table_to_list(data_table2)
    if len(data_list) <= 1 or len(data_list[1]) < 4:
        # Fixed: was `raise '...'` — raising a string is a TypeError on
        # any modern Python; raise a real exception instead.
        raise Exception('CHEM99----get table failed %s' % response.url)
    for row in data_list[1:]:
        for index in range(1, len(row)):
            item = PlasticFarmFilm()
            item['product'] = row[0].strip()
            item['area'] = data_list[0][index].strip()
            item['price'] = row[index].strip()
            item['datadate'] = data_date
            item['update_dt'] = datetime.datetime.now()
            item['source'] = title
            yield item
def parse_content(self, response):
    """Parse bitumen operating-rate (datatype 1) or inspection
    (datatype 2) tables; datatype 3 pages are only logged.

    Yields:
        t_ec_rateofoperation_bitumenItem or t_ec_check_bitumenItem,
        one per qualifying data row.
    """
    datadate = response.meta['datadate']
    datatype = response.meta['datatype']
    if datatype == 3:
        # Fixed: was a bare Python-2 `print`; log instead.
        logging.debug(response.url)
    if datatype == 1:
        data_table = response.xpath('//*[@id="Panel_News"]/div[1]/table')
        data_list = table_to_list(data_table)
        last_week_date = datetime.datetime.strptime(
            datadate, '%Y-%m-%d') - datetime.timedelta(days=7)
        for data in data_list[1:]:
            item = t_ec_rateofoperation_bitumenItem()
            item['datadate'] = datadate
            item['area'] = data[0]
            item['current_week_date'] = datadate
            item['last_week_date'] = last_week_date.strftime('%Y-%m-%d')
            item['current_week_value'] = data[1]
            item['last_week_value'] = data[2]
            item['change_situation'] = data[3]
            item['update_dt'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            item['source'] = response.url
            yield item
    if datatype == 2:
        data_table = response.xpath('//*[@id="Panel_News"]/div[1]/table')
        data_list = table_to_list(data_table)
        # Fixed: was a bare Python-2 `print`; log instead.
        logging.debug(data_list)
        for data in data_list[1:]:
            # Rows whose first cell is 6+ characters are skipped
            # (original behavior preserved — presumably merged/header
            # cells; TODO confirm against the live page layout).
            if len(data[0]) >= 6:
                continue
            item = t_ec_check_bitumenItem()
            item['area'] = data[0]
            item['datadate'] = datadate
            item['factory_name'] = data[1]
            item['affiliation'] = data[2]
            item['product'] = data[3]
            item['status'] = data[4]
            item['product_time'] = data[5]
            item['update_dt'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            item['source'] = response.url
            yield item
def parse_content(self, response):
    """Parse the domestic propene overview into two PropeneMonomer
    items: one for the price row and one for the change row of the
    page's second table.

    Returns:
        list: [price_item, change_item].
    """
    doc = lxml.html.document_fromstring(response.body_as_unicode())
    data_table2 = doc.xpath('//div[@id="PanelContent"]//table')[1]
    title = ''.join(
        response.xpath(
            '//div[@class="news_content "]/h1//text()').extract()).strip()
    pub_date = re.compile(r'(\d{8})').search(title).group(1)
    logging.debug(title)
    # Fixed: tab_title was only bound inside the loop below, causing a
    # NameError when no paragraph matched all keywords. Also removed the
    # unused auth_info extraction.
    tab_title = ''
    for p_text in response.xpath(
            '//div[@id="PanelContent"]//p//text() | //div[@id="PanelContent"]//div//text()'
    ).extract():
        if (u'表' in p_text) and \
                (u'国内' in p_text) and \
                (u'丙烯' in p_text) and \
                (u'价格一览' in p_text):
            tab_title = p_text.strip()
    data_list = table_to_list(data_table2)
    # Row 1 carries prices, row 2 the rise/fall figures; the two items
    # differ only in source row and column_type label.
    items = []
    for row_idx, column_type in ((1, u'价格'), (2, u'涨跌')):
        item = PropeneMonomer()
        item['column_type'] = column_type
        item['sd_area'] = data_list[row_idx][1].strip()
        item['hb_area'] = data_list[row_idx][2].strip()
        item['hd_area'] = data_list[row_idx][3].strip()
        item['xb_area'] = data_list[row_idx][4].strip()
        item['db_area'] = data_list[row_idx][5].strip()
        item['hn_area'] = data_list[row_idx][6].strip()
        item['title'] = title
        item['tab_title'] = tab_title
        item['trading_dt'] = pub_date
        item['datetime_stamp'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        items.append(item)
    return items
def parse_content(self, response):
    """Build RubbUSSThailand items from the USS price table.

    Two layouts exist: an 11-column table (prices plus volumes, data
    rows starting at index 2) and a 7-column table (prices only, data
    rows starting at index 1). Any other width raises.
    """
    date = response.meta['date']
    title = response.meta['title']
    remark = ''.join(
        response.xpath(
            '//div[@id="Panel1"]/p[1]//text()').extract()).strip()
    logging.debug(title)
    tree = lxml.html.document_fromstring(response.body_as_unicode())
    table_node = tree.xpath('//div[@id="Panel1"]//table')[0]
    rows = table_to_list(table_node)
    width = len(rows[0])
    if width not in (11, 7):
        raise Exception('RUBB.CHEM99----get table failed %s' % response.url)
    price_fields = ('product', 'price', 'price_3_5', 'price_5_7',
                    'price_7_10', 'price_10_15')
    volume_fields = ('volume', 'volume_3_5', 'volume_5_7',
                     'volume_7_10', 'volume_10_15')
    if width == 11:
        fields = price_fields + volume_fields
        data_rows = rows[2:]
    else:
        fields = price_fields
        data_rows = rows[1:]
    for row in data_rows:
        item = RubbUSSThailand()
        for pos, field in enumerate(fields):
            item[field] = row[pos].strip()
        item['remark'] = remark
        item['datadate'] = date
        item['update_dt'] = datetime.datetime.now()
        item['source'] = title
        yield item
def parse_content(self, response):
    """Yield one t_ec_merey_oil_Item per month row of the Merey table."""
    datadate = response.meta['datadate']
    table_node = response.xpath('//*[@id="Panel_News"]/div[1]/table')
    rows = table_to_list(table_node)
    # The WTI averaging window is embedded in the header cell.
    # NOTE(review): the pattern has nested groups, so findall returns
    # tuples and wti_date_range becomes a tuple — the outer parens may
    # have been meant as full-width （）; confirm against a live page.
    wti_date_range = re.findall(u'WTI均价((.+))', rows[0][1])[0]
    for row in rows[1:]:
        item = t_ec_merey_oil_Item()
        item['datadate'] = datadate
        item['datemonth'] = row[0]
        item['wti_price_avg'] = row[1]
        item['wti_date_range'] = wti_date_range
        item['discount_value'] = row[2]
        item['tongs_barrels_ratio'] = row[3]
        item['fx_rate'] = row[4]
        item['settle_prc'] = row[5]
        item['update_dt'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        item['source'] = response.url
        yield item
def parse_content(self, response):
    """Parse a plastic-film price page into PlasticFilm items.

    The trading date is the 8-digit date in the title when present;
    otherwise the year comes from a 7-digit run in the title and the
    month/day from the third header cell ('M月D日').

    Raises:
        Exception: when the table is missing or too narrow.
    """
    # Fixed: removed unused `data_table` selector.
    doc = lxml.html.document_fromstring(response.body_as_unicode())
    data_table2 = doc.xpath('//div[@id="PanelContent"]//table')[0]
    title = ''.join(
        response.xpath(
            '//div[@class="news_content "]/h1//text()').extract()).strip()
    logging.debug(title)
    data_list = table_to_list(data_table2)
    if len(data_list) <= 1 or len(data_list[1]) < 7:
        # Fixed: was `raise '...'` — raising a string is a TypeError on
        # any modern Python; raise a real exception instead.
        raise Exception('CHEM99----get table failed %s' % response.url)
    datematch = re.search(r'(\d{8})', title)
    if datematch:
        data_date = datetime.datetime.strptime(datematch.group(1), '%Y%m%d')
    else:
        year = re.search(r'(\d{7})', title).group(1)[0:4]
        monday = re.search(u'(\d+)月(\d+)日', data_list[0][2])
        month = monday.group(1)
        day = monday.group(2)
        data_date = datetime.datetime(int(year), int(month), int(day))
    for row in data_list[1:]:
        item = PlasticFilm()
        item['product'] = row[0].strip()
        item['spec'] = row[1].strip()
        item['price'] = row[2].strip()
        item['rise_offset'] = row[3].strip()
        item['than_lastweek'] = row[4].strip()
        item['than_lastmonth'] = row[5].strip()
        item['than_lastyear'] = row[6].strip()
        item['datadate'] = data_date
        item['update_dt'] = datetime.datetime.now()
        item['source'] = title
        yield item
def parse_content(self, response):
    """Yield MarketReviewBitumen items from the transposed review table.

    The first five rows of the raw table are transposed before parsing;
    a malformed table is logged (not raised) and parsing continues.
    """
    tree = lxml.html.document_fromstring(response.body_as_unicode())
    table_node = tree.xpath('//div[@id="Panel_News"]//table')[0]
    title = ''.join(
        response.xpath(
            '//div[@class="div_news"]/h1//text()').extract()).strip()
    logging.debug(title)
    data_date = datetime.datetime.strptime(response.meta['data_date'],
                                           '%Y-%m-%d')
    rows = table_to_list(table_node)
    # Transpose: take the first five rows and swap rows/columns.
    rows = trans_table(rows[0:5])
    if len(rows) <= 1 or len(rows[1]) < 5:
        logging.error('OIL.CHEM99----get table failed %s' % response.url)
    # The unit (e.g. yuan/ton) sits in a '单位:...' paragraph.
    unit = None
    match = re.search(
        u'单位:(.*)',
        tree.xpath('//div[@id="Panel_News"]//p/text()')[1].strip())
    if match:
        unit = match.group(1)
    for row in rows[1:]:
        item = MarketReviewBitumen()
        item['area'] = row[0].strip()
        item['pre_price'] = row[1].strip()
        item['price'] = row[2].strip()
        item['change'] = row[3].strip()
        item['changeratio'] = row[4].strip()
        if unit:
            item['unit'] = unit
        item['datadate'] = data_date
        item['update_dt'] = datetime.datetime.now()
        item['source'] = title
        yield item
def parse_content(self, response):
    """Extract the news-panel table. Currently a stub: the table is
    parsed but no items are built or yielded yet.
    """
    data_table = response.xpath('//*[@id="Panel_News"]/div[1]/table')
    data_list = table_to_list(data_table)
    # Fixed: removed the dead placeholder assignment (a = 'aaa').
    # TODO: build and yield items from data_list.