def parse_stock(self, response):
    hxs = HtmlXPathSelector(response)
    item = StockItem()
    item['name'] = hxs.select('//div/h1/a/strong/text()').extract()[0]
    item['code'] = hxs.select('//div/h1/a/text()').extract()[1].strip(" \t\n")

    company_node = hxs.select('//dl[contains(@class, "company_details")]')
    strong_list = company_node.select('.//dd/strong/text()').extract()
    item['captial'] = float(strong_list[0])
    item['out_captial'] = float(strong_list[1])
    item['profit'] = float(strong_list[4])
    item['assets'] = float(strong_list[5])

    company_url = "http://stockpage.10jqka.com.cn/" + item['code'] + "/company/"
    request = Request(company_url, callback=self.parse_company)
    request.meta['item'] = item
    yield request

def parse_json(self, response):
    # The response is "callback=<json>"; keep only the JSON payload.
    parts = response.body.split("=")
    content = safestr(parts[1].decode('gbk'))
    data = json.loads(content)
    item_list = data['data']['result']
    print(len(item_list))
    for info in item_list:
        item = StockItem()
        item['location'] = 3
        code = info[0]
        code_parts = code.split(".")
        if len(code_parts) >= 2:
            ecode = code_parts[-1]
            if "N" == ecode:
                item['ecode'] = "NYSE"
            elif "OQ" == ecode:
                item['ecode'] = "NASDAQ"
        item['name'] = info[2]
        item['code'] = info[1]
        stock_url = "http://stockhtm.finance.qq.com/astock/ggcx/" + code + ".htm"
        request = Request(stock_url, callback=self.parse_data)
        request.meta['item'] = item
        yield request

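# `safestr` is called above (and in parse_quotes below) but is not defined in
# this excerpt. A minimal sketch of what it plausibly does: coerce bytes or
# text to a plain str without raising on bad characters. The name is taken
# from the call sites; the exact behavior here is an assumption.
def safestr(value, encoding='utf-8'):
    """Hypothetical helper: best-effort conversion of bytes/text to str."""
    if isinstance(value, bytes):
        return value.decode(encoding, errors='replace')
    return str(value)
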
def parse(self, response):
    res_td = r'<td>(.*?)</td>'
    for sel in response.xpath('//table[@id="financial_analysis"]//tr'):
        self.index = 0
        item = StockItem()
        link = response.url
        item["link"] = link
        item["num"] = link.split("/")[-1][:-5]
        for trinfo in sel.xpath('td[not(contains(@class, "showRedTips"))]'):
            self.index += 1
            line = trinfo.extract()
            if self.index == 6:
                item["peratio"] = re.findall(res_td, line, re.S | re.M)
            elif self.index == 10:
                item["roe"] = re.findall(res_td, line, re.S | re.M)
            elif self.index == 11:
                item["dividend"] = re.findall(res_td, line, re.S | re.M)
        # Only rows with the full set of 11 cells produce an item.
        if self.index == 11:
            yield item

def parse_page(self, response):
    try:
        item = StockItem()
        # Stock name
        item['name'] = response.xpath(
            '//a[@class="bets-name"]/text()').extract()[0].strip()[:4]
        # Closing price
        item['close_price'] = response.xpath(
            '//div[@class="line2"]//dl[1]/dd/text()').extract()[0]
        # Opening price
        item['open_price'] = response.xpath(
            '//div[@class="line1"]//dl[1]/dd/text()').extract()[0]
        # Current price
        item['cur_price'] = response.xpath('//strong/text()').extract()[0]
        # Highest price
        item['highest_price'] = response.xpath(
            '//div[@class="line1"]/dl[3]/dd/text()').extract()[0]
        # Lowest price
        item['lowest_price'] = response.xpath(
            '//div[@class="line2"]/dl[3]/dd/text()').extract()[0]
        # Trading volume
        item['volume'] = response.xpath(
            '//div[@class="line1"]/dl[2]/dd/text()').extract()[0]
        # Turnover rate
        item['change_rate'] = response.xpath(
            '//div[@class="line2"]/dl[2]/dd/text()').extract()[0]
        yield item
    except IndexError:
        # Page layout changed or a field is missing; skip this page.
        pass

def handlePage(self, response):
    print("Processing list page")
    trs = response.xpath("//tr")
    for tr in trs[1:]:  # skip the header row
        # Stock code and short name
        number = tr.xpath(".//a[@target='_blank']")[0].xpath('text()').extract()[0]
        name = tr.xpath(".//a[@target='_blank']")[1].xpath('text()').extract()[0]
        # Price
        price = tr.xpath("./td[4]").xpath('text()').extract()[0]
        # Percent change
        gains = tr.xpath("./td[5]").xpath('text()').extract()[0]
        # Turnover rate
        rate = tr.xpath("./td[8]").xpath('text()').extract()[0]
        # Volume ratio
        thanCarrie = tr.xpath("./td[9]").xpath('text()').extract()[0]
        # Amplitude
        amplitude = tr.xpath("./td[10]").xpath('text()').extract()[0]
        print(name)
        stock = StockItem()
        stock['stock_id'] = number
        stock['stock_name'] = name
        stock['stock_price'] = price
        stock['stock_gains'] = gains
        stock['stock_rate'] = rate
        stock['stock_thanCarrie'] = thanCarrie
        stock['stock_amplitude'] = amplitude
        stock['today'] = time.strftime("%Y-%m-%d", time.localtime())
        yield stock

def parse(self, response):
    stock_codes = response.xpath('//*[@id="quotesearch"]/ul[1]/li')
    for title in stock_codes:
        # Build a fresh item per row; reusing one StockItem instance across
        # yields would hand every downstream consumer the same mutated object.
        item = StockItem()
        item['stock_code'] = title.xpath('.//a[@target="_blank"]/text()').extract()
        yield item

def down_gu(self, response):
    print(response.url)
    tr_list = response.xpath('//table[@class="m-table"]/tbody/tr')
    for td in tr_list:
        stock_item = StockItem()  # fresh item per table row
        content_list = td.xpath('./td/text()').extract()
        content_list[1] = content_list[1].strip()
        stock_item['xuhao'] = content_list[0]
        stock_item['jysj'] = content_list[1]
        stock_item['rz_ye'] = self.str2num(content_list[2])
        stock_item['rz_mre'] = self.str2num(content_list[3])
        stock_item['rz_che'] = self.str2num(content_list[4])
        stock_item['rz_rzjmr'] = self.str2num(content_list[5])
        stock_item['rq_ye'] = self.str2num(content_list[6])
        stock_item['rq_mre'] = self.str2num(content_list[7])
        stock_item['rq_che'] = self.str2num(content_list[8])
        stock_item['rq_rzjmr'] = self.str2num(content_list[9])
        stock_item['rzrqye'] = self.str2num(content_list[10])
        print(stock_item)
        yield stock_item

    # The remaining pages are loaded dynamically; only the first three are fetched.
    if response.meta['index'] > 3:
        return
    response.meta['index'] += 1
    # Expected URL pattern, e.g.:
    # http://data.10jqka.com.cn/market/rzrqgg/code/000725/order/desc/page/2/ajax/1/
    page_url = ('http://data.10jqka.com.cn/market/rzrqgg/code/'
                + str(response.meta['gu_hao'])
                + '/order/desc/page/'
                + str(response.meta['index']) + '/ajax/1/')
    yield scrapy.Request(url=page_url, callback=self.down_gu,
                         meta={
                             "gu_name": response.meta['gu_name'],
                             "gu_hao": response.meta['gu_hao'],
                             "index": response.meta['index'],
                         })

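# `str2num` is used above but not defined in this excerpt. A plausible sketch:
# strip thousands separators and unit suffixes, then convert to float. The
# suffix handling (万/亿) is an assumption based on how 10jqka typically
# renders numbers; adjust to the actual table format.
def str2num(text):
    """Hypothetical helper: '1,234.5万' -> 12345000.0, '--' -> 0.0."""
    text = text.strip().replace(',', '')
    if text in ('', '--'):
        return 0.0
    if text.endswith('万'):   # ten-thousands
        return float(text[:-1]) * 1e4
    if text.endswith('亿'):   # hundred-millions
        return float(text[:-1]) * 1e8
    return float(text)
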
def parse(self, response):
    url = 'https://smart.tdcc.com.tw/opendata/getOD.ashx?id=1-5'
    data = pd.read_csv(url)
    self.copy2Hist()   # move current data to the history table before loading
    self.clearTable()  # clear the working table
    for index, row in data.iterrows():
        item = StockItem()
        item['data_date'] = row[0]          # data date (資料日期)
        item['stock_no'] = row[1].zfill(6)  # security code (證券代號)
        item['level'] = row[2]              # holding level (持股分級)
        item['holder_num'] = row[3]         # number of holders (人數)
        item['stock_num'] = row[4]          # number of shares (股數)
        item['percent'] = row[5]            # % of total custody (占集保庫存數比例%)
        yield item

def parse(self, response):
    td = datetime.date.today().strftime('%Y%m%d')
    with open("holding_data/" + td + ".csv", "w") as f:
        f.write("stock_id;stock_name;holding_num;holding_perc\n")
        for sel in response.xpath("//tr").extract():
            info = Selector(text=sel).xpath("//td/text()").extract()
            if len(info) != 4:
                continue
            a = info[0].split("\r\n")[1]
            b = info[1].split("\r\n")[1]
            c = info[2].split("\r\n")[1]
            d = info[3].split("\r\n")[1]
            f.write(a + ";" + b + ";" + c + ";" + d + "\n")

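# Hand-rolled CSV writing like the above is easy to get wrong (separators,
# quoting, newlines). A minimal alternative sketch using the stdlib csv
# module; the function name and the row-tuple shape are illustrative only:
import csv
import datetime

def write_holdings(rows):
    """Write (stock_id, stock_name, holding_num, holding_perc) tuples to CSV."""
    td = datetime.date.today().strftime('%Y%m%d')
    with open('holding_data/' + td + '.csv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(['stock_id', 'stock_name', 'holding_num', 'holding_perc'])
        writer.writerows(rows)
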
def parse_quotes(self, response):
    content = safestr(response.body)
    quotes_data = json.loads(content)
    for quote_info in quotes_data['quotes']:
        # Delisted stocks are flagged with 3; skip them.
        if 3 == int(quote_info['flag']):
            print("op=stock_quit code=" + safestr(quote_info['symbol'])
                  + " name=" + safestr(quote_info['name']))
            continue
        item = StockItem()
        item['location'] = 3
        item['code'] = quote_info['symbol']
        item['name'] = quote_info['name']
        stock_name = safestr(quote_info['name'])
        exchange = safestr(quote_info['exchange'])
        if exchange == "NASDAQ":
            item['ecode'] = 4
        elif exchange == "NYSE":
            item['ecode'] = 5
        else:
            # Ignore US stocks that are on neither NASDAQ nor NYSE.
            print("op=stock_ignore code=" + safestr(quote_info['symbol'])
                  + " name=" + stock_name + " exchange=" + exchange)
            continue
        # Total share capital (in hundreds of millions)
        if len(quote_info['totalShares']) > 0:
            item['out_captial'] = float(quote_info['totalShares']) / 100000000
        # Dividend
        if len(quote_info['dividend']) > 0:
            item['dividend'] = float(quote_info['dividend'])
        # Earnings per share
        if len(quote_info['eps']) > 0:
            item['profit'] = float(quote_info['eps'])
        # Net assets per share
        if len(quote_info['net_assets']) > 0:
            item['assets'] = float(quote_info['net_assets'])
        yield item

def parse(self, response):
    print(response.url)
    # Top-level category blocks
    div_list = response.xpath("//div[contains(@class,'category')]")
    for div in div_list[1:2]:
        item = StockItem()
        item["b_cate"] = div.xpath("./div[@class='c_title']//h2/text()").extract_first()
        # Sub-category links
        a_list = div.xpath(".//div[@class='option_group clearfix']/div")
        for a in a_list:
            item["s_href"] = a.xpath("./a/@href").extract_first()
            item["s_cate"] = a.xpath("./a/text()").extract_first()
            if item["s_href"] is not None:
                item["s_href"] = urljoin(response.url, item["s_href"])
                yield scrapy.Request(
                    item["s_href"],
                    callback=self.parse_stock_list,
                    # deepcopy so each request carries its own snapshot of the item
                    meta={"item": deepcopy(item), 'download_timeout': 10},
                    dont_filter=True,
                )

def parse_stock(self, response):
    sItem = StockItem()
    try:
        stockInfo = response.css('.stock-bets')
        betsname = stockInfo.css('.bets-name')
        sItem['code'] = betsname.css('span::text').extract_first()
        sItem['name'] = betsname.css('::text').extract_first()[1:-1].strip()
        state = stockInfo.css('span')[1]
        sItem['trade_date'] = state.re_first(r"\d{4}-\d{2}-\d{2}")
        sItem['time'] = state.re_first(r"\d{2}:\d{2}:\d{2}")
        # The close price is only meaningful after the 15:00 market close.
        sItem['close'] = stockInfo.css('._close::text').extract_first() \
            if sItem['time'][0:2] == '15' else None
        sItem['last'] = stockInfo.css('strong::text').extract_first()
        valueList = stockInfo.css('dd')
        sItem['open'] = valueList[0].css('::text').extract_first()
        sItem['volume'] = valueList[1].css('::text').extract_first()[0:-2]
        sItem['high'] = valueList[2].css('::text').extract_first()
        sItem['limit_up'] = valueList[3].css('::text').extract_first()
        sItem['turnover'] = valueList[5].css('::text').extract_first()[0:-1]
        sItem['pe'] = valueList[8].css('::text').extract_first()
        sItem['total_equity'] = valueList[10].css('::text').extract_first()[0:-1]
        sItem['preclose'] = valueList[11].css('::text').extract_first()
        sItem['turnover_rate'] = valueList[12].css('::text').extract_first()[0:-1]
        sItem['low'] = valueList[13].css('::text').extract_first()
        sItem['limit_down'] = valueList[14].css('::text').extract_first()[1:].lstrip()
        sItem['volume_ratio'] = valueList[17].css('::text').extract_first()[0:-1]
        sItem['flow_equity'] = valueList[21].css('::text').extract_first()[0:-1]
    except (IndexError, TypeError):
        # Incomplete page; yield whatever fields were filled in.
        pass
    yield sItem

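# The positional indexing into `valueList` above breaks whenever the page
# layout shifts by one cell. A small defensive helper (hypothetical, not part
# of the original code) that returns None instead of raising on a missing cell:
def cell_text(cells, index, strip_chars=None):
    """Return the text of cells[index], or None if the cell is absent (sketch)."""
    try:
        text = cells[index].css('::text').extract_first()
    except IndexError:
        return None
    if text is not None and strip_chars is not None:
        text = text.strip(strip_chars)
    return text
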
def parse(self, response):
    self.count = 0
    for sel in response.xpath(
            '//li//a[contains(@href, "http://quote.eastmoney.com/hk")]/@href'):
        # Debug cap on the number of items emitted per page.
        if self.count >= 10000:
            continue
        link = sel.extract()
        num = link.split("/")[-1].split(".")[0]
        if num.isdigit():
            num = int(num)
        else:
            continue
        item = StockItem()
        item['num'] = num
        item['link'] = link
        self.count += 1
        yield item

def parse(self, response):
    sel = Selector(response)
    stocks = sel.xpath("//tbody/tr")
    items = []
    for stock in stocks:
        item = StockItem()
        item['code'] = stock.xpath("td[2]/a/text()")[0].extract()
        item['link'] = stock.xpath("td[2]/a[1]/@href")[0].extract()
        item['name'] = stock.xpath("td[3]/a/text()")[0].extract()
        item['curprice'] = stock.xpath("td[4]/text()")[0].extract()
        item['change_rate'] = stock.xpath("td[5]/text()")[0].extract()
        item['change_price'] = stock.xpath("td[6]/text()")[0].extract()
        item['up_speed'] = stock.xpath("td[7]/text()")[0].extract()
        item['turnover_rate'] = stock.xpath("td[8]/text()")[0].extract()
        item['ratio'] = stock.xpath("td[9]/text()")[0].extract()
        item['amplitude'] = stock.xpath("td[10]/text()")[0].extract()
        item['turnover_volume'] = stock.xpath("td[11]/text()")[0].extract()
        item['tradable_shares'] = stock.xpath("td[12]/text()")[0].extract()
        item['circulation_market_value'] = stock.xpath("td[13]/text()")[0].extract()
        item['P_E_ratio'] = stock.xpath("td[14]/text()")[0].extract()
        items.append(item)
    return items

def parse(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    print('cur', self.curr_stock_code)
    start_time = soup.find('input', {'name': 'date_start_type'}).get('value').replace('-', '')
    end_time = soup.find('input', {'name': 'date_end_type'}).get('value').replace('-', '')
    file_item = StockItem()
    if len(self.curr_stock_code) > 0:
        stock_code_a = str(self.curr_stock_code)
        new_stock_code = None
        if int(stock_code_a[0]) in (0, 2, 3, 6, 9):
            # 163.com prefixes Shanghai codes (leading 6/9) with '0' and
            # Shenzhen codes (leading 0/2/3) with '1'.
            if int(stock_code_a[0]) in [6, 9]:
                new_stock_code = '0' + stock_code_a
            if int(stock_code_a[0]) in [0, 2, 3]:
                if not int(stock_code_a[0:3]) in (201, 202, 203, 204):
                    new_stock_code = '1' + stock_code_a
        if new_stock_code is None:
            return  # unsupported code; avoid building an invalid URL
        download_url = ('http://quotes.money.163.com/service/chddata.html'
                        '?code={}&start={}&end={}'
                        '&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;'
                        'VOTURNOVER;VATURNOVER;TCAP;MCAP').format(
                            new_stock_code, start_time, end_time)
        file_item['file_urls'] = [download_url]
        yield file_item

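# The prefixing branches above encode 163.com's code convention. A compact
# sketch of the same rule as a standalone helper; the function name is
# hypothetical and the 201-204 exclusion mirrors the check in the method above:
def netease_code(stock_code):
    """Map a raw exchange code to 163.com's chddata code, or None (sketch)."""
    code = str(stock_code)
    if code[0] in '69':
        return '0' + code
    if code[0] in '023' and int(code[:3]) not in (201, 202, 203, 204):
        return '1' + code
    return None
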
def parse(self, response):
    item = StockItem()
    # Column-header cells of the data table
    nodeList = response.xpath(
        '//*[@id="tableWrap"]/div[2]/div/div[1]/div/div/div[1]/ul/li')
    if len(nodeList) == 0:
        # Nothing scraped; wait briefly (a retry request could be issued here).
        time.sleep(1)
        return
    item['minLen'] = 10000
    item['code'] = response.xpath(
        '//div[@id="tableWrap"]/div[2]/div/div[2]/div/table/tbody/tr[*]/td[3]/div/text()'
    ).extract()
    item['name'] = response.xpath(
        '//div[@id="tableWrap"]/div[2]/div/div[2]/div/table/tbody/tr[*]/td[4]/div/a/text()'
    ).extract()
    col = 1
    for node in nodeList:
        colNum = 0
        if len(node.xpath('./dl')):
            # Header cell spans several sub-columns
            colNum = len(node.xpath('./dl/dd')) - 1
            if len(node.xpath('./dl/dt/div/span')):
                attr = node.xpath('./dl/dt/div/span[1]/text()').extract()[0]
            else:
                attr = node.xpath('./dl/dt/div/text()').extract()[0]
        else:
            if len(node.xpath('./div[1]/span')):
                # Header carries an explanatory tooltip
                attr = node.xpath('./div[1]/span[1]/text()').extract()[0]
            else:
                attr = node.xpath('./div[1]/text()').extract()[0]
        string = attrStrCmp(attr)
        text = ('//div[@class="scroll_tbody_con"]/table/tbody/tr[*]/td[{:d}]/div/'
                .format(col))
        # A cell may hold plain text, a link, or a span; match any of them.
        item[string] = response.xpath(
            text + 'a/text()' + '|' + text + 'text()' + '|' + text + 'span[1]/text()'
        ).extract()
        col = col + 1 + colNum
        item['minLen'] = min(item['minLen'], len(item[string]))
    with open("D:/share/自由流通市值.txt", "w") as f:
        for i in range(len(item['code'])):
            f.write(item['code'][i] + ' ' + item['name'][i] + ' '
                    + item['自由流通股'][i] + ' ' + item['股性评分'][i] + '\n')
        f.write('\n')
    yield item

def parse(self, response):
    for t in response.css('#datalist tr'):
        item = StockItem()
        # Stock code
        stockId = t.css('td a::text').extract()[0]
        # Short name
        Abbreviation = t.css('td a::text').extract()[1]
        # Latest price
        latestPrice = float(t.css('td span::text').extract()[0])
        # Percent change
        quoteChange = t.css('td span::text').extract()[1]
        quoteChange = round(float(quoteChange.strip('%')) / 100, 4)
        # Change amount
        amountChange = float(t.css('td span::text').extract()[2])
        # 5-minute gain
        increase = t.css('td span::text').extract()[3]
        increase = round(float(increase.strip('%')) / 100, 4)
        # Volume
        volume = float(t.css('td::text').extract()[0])
        # Turnover (value traded)
        turnover = float(t.css('td::text').extract()[1])
        # Turnover rate
        handTurnoverRate = t.css('td::text').extract()[2]
        handTurnoverRate = round(float(handTurnoverRate.strip('%')) / 100, 4)
        # Amplitude
        amplitude = t.css('td::text').extract()[3]
        amplitude = round(float(amplitude.strip('%')) / 100, 4)
        # Volume ratio
        volumeRatio = float(t.css('td::text').extract()[4])
        # Commission ratio
        commission = float(t.css('td::text').extract()[5])
        # P/E ratio ('--' means not available)
        PERatio = t.css('td::text').extract()[6]
        PERatio = 0.0 if PERatio == '--' else float(PERatio)
        item['stockId'] = stockId
        item['Abbreviation'] = Abbreviation
        item['latestPrice'] = latestPrice
        item['quoteChange'] = quoteChange
        item['amountChange'] = amountChange
        item['increase'] = increase
        item['volume'] = volume
        item['turnover'] = turnover
        item['handTurnoverRate'] = handTurnoverRate
        item['amplitude'] = amplitude
        item['volumeRatio'] = volumeRatio
        item['commission'] = commission
        item['PERatio'] = PERatio
        yield item

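# The percent-to-fraction conversion is repeated several times above. A small
# helper capturing that pattern, including the '--' placeholder the PERatio
# handling special-cases (the helper name is illustrative, not from the source):
def pct_to_float(text, digits=4):
    """'12.34%' -> 0.1234; '--' -> 0.0 (sketch)."""
    if text == '--':
        return 0.0
    return round(float(text.strip('%')) / 100, digits)
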
def aNewPage(self, response):
    print('Crawling ' + response.url)
    # Store the scraped data in an item (see the item class in items.py),
    # filling each field from the XPaths observed on the page.
    item = StockItem()
    item['url'] = [response.url]
    a = item['url']
    e = 'weixin'
    # Handlers for the other sources once matched here (stock.10, yicai,
    # 21jingji, jiemian, wabei, sohu) followed the same pattern with
    # site-specific XPaths and are currently disabled.
    if e in a[0]:
        print(e)
        item['title'] = response.xpath(
            '//h2[@class="rich_media_title"]/text()').extract()
        print(item['title'])
        content = response.xpath(
            '//div[@class="rich_media_content"]/span/text()').extract()
        # The content arrives as a list of fragments; join them and
        # normalize full-width spaces.
        item['content'] = ["\n".join(content)][0].replace(u'\u3000', u' ')
        print(item['content'])
    # Hand the item to the pipelines.
    yield item

def parse_taichinh(self, response):
    if not response.body:
        return
    # Income-statement and balance-sheet fields: item field -> table row id.
    # These read the data column at offset i + 2.
    FIN_ROWS = {
        'dthuthuanve_bh_va_ccdv': 2216, 'giavon_bh': 2207,
        'loinhuangopve_bh_va_ccdv': 2217, 'dthuhoatdong_tc': 2221,
        'chiphi_tc': 2222, 'chiphi_bh': 2227, 'chiphi_qldn': 2224,
        'loinhuanthuanve_hdkd': 2208, 'loinhuankhac': 2209,
        'phanloinhuantucty_lkkd': 2210, 'tongloinhuanketoantrcthue': 2211,
        'loinhuansauthue_tndn': 2212, 'loinhuansauthue_cuacongdongctyme': 2214,
        'lai_cb_tren_cp': 2215, 'ts_nganhan': 3000,
        'tienvacackhoan_td_tien': 3003, 'cackhoandautu_tcnh': 3004,
        'cackhoanphaithunganhan': 3005, 'hangtonkho': 3006,
        'taisannganhankhac': 3007, 'taisandaihan': 3001, 'taisancodinh': 3009,
        'bds_dautu': 3010, 'cackhoandautu_tcdh': 3011, 'tongcongtaisan': 2996,
        'nophaitra': 2997, 'nonganhan': 3014, 'nodaihan': 3017,
        'vochusohuu': 2998, 'vondautucua_csh': 3063, 'thangduvon_cp': 3064,
        'loinhuansauthue_chuapp': 3072, 'loiichcuacongdongthieuso': 3002,
        'tongcongnguonvon': 2999,
    }
    # Financial-ratio fields read the data column at offset i + 3.
    RATIO_ROWS = {
        'eps': 53, 'bvps': 54, 'p_e': 55, 'p_b': 57,
        'tisuatloinhuangopbien': 41, 'tisuatsinhloitrendoanhthuthuan': 44,
        'roea': 45, 'roaa': 47, 'tisothanhtoanhienhanh': 4,
        'khanangthanhtoanvaylai': 5, 'tisonotrentrongtaisan': 8,
        'tisonotrenvonchusohuu': 11,
    }
    # Ticker symbol from the URL query string
    re_mack = re.match(r'.*&scode=(.*)&bizType.*', response.url)
    i = 1
    for sel in response.xpath("//tr[@class='BR_rowHeader']/td[position() > 1]"):
        item = StockItem()
        item['mack'] = re_mack.group(1)
        item['date'] = sel.xpath('./text()').extract_first('').strip()
        for field, row_id in FIN_ROWS.items():
            item[field] = response.xpath(
                "//tr[@id='{}']/td[{}]/text()".format(row_id, i + 2)
            ).extract_first('').strip()
        for field, row_id in RATIO_ROWS.items():
            item[field] = response.xpath(
                "//tr[@id='{}']/td[{}]/text()".format(row_id, i + 3)
            ).extract_first('').strip()
        yield item
        i += 1  # advance to the next reporting period's column