class Fund:
    """Crawl fund codes from a paginated Selenium-driven listing into MySQL.

    Resumes from the page count recorded in ``crawl_info`` so a restarted run
    fast-forwards (clicks through without scraping) the already-crawled pages.
    """

    # The last span matching this XPath is the "next page" button.
    _NEXT_PAGE_XPATH = "//div[@id='pager']/span[@class='nu page']"

    def __init__(self):
        self.Sql = Sql()
        self.db_conn = self.Sql.conn_db(db='fund')

    def main(self, driver):
        """Scrape every fund code page by page.

        :param driver: a Selenium WebDriver already positioned on the listing page.
        """
        total_page = driver.find_element_by_class_name('nv').text
        # Raw string: '\d' in a plain literal is an invalid escape (DeprecationWarning).
        total_page = int(re.search(r'\d+', total_page).group())
        print('total page:', total_page)
        sql = 'select crawledPage from crawl_info where spiderName = "fund"'
        crawledPage = int(
            self.Sql.exec_sql(self.db_conn, sql).fetchall()[0][0])
        current_page = crawledPage
        if current_page == total_page:
            return  # nothing new to crawl
        for i in range(total_page):
            if i < current_page:
                # Page already crawled in a previous run: just advance.
                next_page = driver.find_elements_by_xpath(self._NEXT_PAGE_XPATH)[-1]
                next_page.click()
                time.sleep(10)  # be polite to the server / let the page render
                continue
            try:
                fund_ids = driver.find_elements_by_class_name('bzdm')
                for fund_id in fund_ids:
                    fund_id = fund_id.text
                    # NOTE(review): string-built SQL from scraped text; prefer a
                    # parameterized query if exec_sql supports bind parameters here.
                    sql = ('insert into fund(fund) select "{0}" from dual '
                           'where "{0}" not in(select fund from fund)').format(fund_id)
                    self.Sql.exec_sql(self.db_conn, sql)
                current_page += 1
                print('Crawled Page {}'.format(current_page))
                sql = ('update crawl_info set crawledPage = {} '
                       'where spiderName = "fund"').format(current_page)
                self.Sql.exec_sql(self.db_conn, sql)
                next_page = driver.find_elements_by_xpath(self._NEXT_PAGE_XPATH)[-1]
                next_page.click()
                time.sleep(10)
            except Exception as reason:
                # Best-effort: progress is persisted per page, so log and carry on.
                print('Spider Crawl Failed in Page {0}, {1}'.format(
                    current_page, str(reason)))
# all fund code
# Crawl the full fund list from eastmoney and store (code, name) rows.
import requests
from lxml import etree
from sql import Sql

sql_client = Sql()  # renamed: the original `Sql = Sql()` shadowed the class
db_conn = sql_client.conn_db('fund')
url = 'http://fund.eastmoney.com/allfund.html'
r = requests.get(url)
r.encoding = 'gb2312'  # page is served in a GB-family encoding
html = etree.HTML(r.text)
num_boxes = html.xpath('//div[@id="code_content"]//div[@class="num_box"]')
data = []
# Relative XPaths below fix the original, which used absolute paths inside the
# loops (returning every node on the page) and so only iterated [num_boxes[0]].
for num_box in num_boxes:
    for li in num_box.xpath('./ul/li'):
        for fund in li.xpath('./div/a[1]/text()'):
            print(fund)
            # Entries look like "(code)name"; partition keeps any ')' in the name.
            head, _, name = fund.partition(')')
            code = head[1:]  # drop the leading '('
            data.append([code, name])
# Parameterized batch insert instead of string-formatted SQL (injection-safe).
sql = 'insert into fund(code, name) values (%s, %s)'
sql_client.exec_sql(db_conn, sql, data)
# https://www.fnscore.com/detail/league/kog-2/league-kog-647.html
# Scrape one league's finished-match table and bulk-insert it into kpl2020_autumn.
import requests
from bs4 import BeautifulSoup
from sql import Sql

Sql = Sql()
db_conn = Sql.conn_db('wzmatch')
url = 'https://www.fnscore.com/detail/league/kog-2/league-kog-647.html'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

rows = []
# The second match-panel container holds the results table.
containers = soup.find_all('div', 'match-panel-container')
for item in containers[1].find_all('div', 'match-panel-item match-table-item'):
    texts = item.find_all('p')
    team_divs = item.find_all('div', 'team')
    rows.append([
        texts[0].get_text(),                # match time
        team_divs[0].find('p').get_text(),  # team A
        team_divs[1].find('p').get_text(),  # team B
        texts[2].get_text(),                # score
        texts[4].get_text(),                # best-of (BO)
    ])

sql = 'insert into kpl2020_autumn(matchTime,teamA,teamB,score,BO) values(%s,%s,%s,%s,%s)'
Sql.exec_sql(db_conn, sql, rows)
from bs4 import BeautifulSoup from sql import Sql Sql = Sql() db_conn = Sql.conn_db('lolgamequiz') url = 'https://lpl.qq.com/es/worlds/2020/' html = """ <div class="swiper-wrapper" id="team_list" style="transform: translate3d(0px, 0px, 0px);"><a href="//lpl.qq.com/es/team_detail.shtml?tid=29" target="_blank" class="swiper-slide swiper-slide-active" onclick="PTTSendClick('btn','btn-team1','队伍');"> <div> <img src="//img.crawler.qq.com/lolwebvideo/20190523093050/f63d37cbc3810f2f9e8fb5688dd40254/0" alt=""> </div> <p>LPL赛区:<span>JDG</span></p> </a><a href="//lpl.qq.com/es/team_detail.shtml?tid=41" target="_blank" class="swiper-slide swiper-slide-next" onclick="PTTSendClick('btn','btn-team1','队伍');"> <div> <img src="//img.crawler.qq.com/lolwebvideo/20190523093521/b753e24c05cc53123ce5fa3f3a19162f/0" alt=""> </div> <p>LPL赛区:<span>SN</span></p> </a><a href="//lpl.qq.com/es/team_detail.shtml?tid=4" target="_blank" class="swiper-slide" onclick="PTTSendClick('btn','btn-team1','队伍');"> <div> <img src="//img.crawler.qq.com/lolwebvideo/20190523093621/b1721b1e247c18bab54a548775a887a5/0" alt=""> </div> <p>LPL赛区:<span>LGD</span></p> </a><a href="//lpl.qq.com/es/team_detail.shtml?tid=117" target="_blank" class="swiper-slide" onclick="PTTSendClick('btn','btn-team1','队伍');"> <div> <img src="//img.crawler.qq.com/lolwebvideo/20190919151523/2a9931322ed5750213ab6204adadaec1/0" alt=""> </div> <p>LEC赛区:<span>G2</span></p> </a><a href="//lpl.qq.com/es/team_detail.shtml?tid=42" target="_blank" class="swiper-slide" onclick="PTTSendClick('btn','btn-team1','队伍');"> <div> <img src="//game.gtimg.cn/images/lpl/act/a20200901worlds/c6-team2.png" alt=""> </div>
class JJJZ:
    """Crawl each fund's net-asset-value (jjjz) history table into MySQL.

    Resumes from ``crawl_info.crawledFundId`` so only funds not yet crawled
    are visited; the watermark is advanced after each fund's rows are stored.
    """

    def __init__(self):
        self.Sql = Sql()
        self.Driver = Driver()
        self.db_conn = self.Sql.conn_db(db='fund')

    def main(self):
        driver = self.Driver.main()
        root = 'http://fundf10.eastmoney.com/jjjz_'
        # Funds with an id past the last successfully crawled fund.
        sql = ('select id,fund from fund where id > (select ifnull(crawledFundId,0) '
               'from crawl_info where spiderName = "jjjz")')
        fund_rows = self.Sql.exec_sql(self.db_conn, sql)
        driver.implicitly_wait(2)
        # `fund_id` replaces the original `id`, which shadowed the builtin.
        for fund_id, fund_code in fund_rows:
            jjjz_page = 0
            driver.get(root + fund_code + '.html')
            jjjz_total_page = int(driver.find_elements_by_xpath(
                '//div[@class="pagebtns"]/label')[-2].text)
            data = []
            # Distinct loop variable: the original reused `i` for both the page
            # loop and the row loop, shadowing the outer index.
            for _page in range(jjjz_total_page):
                trs = driver.find_elements_by_xpath(
                    '//div[@id="jztable"]/table/tbody/tr')
                for tr in trs:
                    # One cell scan per row instead of seven separate XPath
                    # round-trips; td order is date, unitJZ, totalJZ, dateRate,
                    # buyStatus, saleStatus, red.
                    cells = [td.text.strip()
                             for td in tr.find_elements_by_xpath('./td')[:7]]
                    data.append([fund_id] + cells)
                print('Crawling Fund {0}, Crawled Page {1}'.format(
                    fund_code, jjjz_page))
                jjjz_page += 1
                jjjz_next_page = driver.find_elements_by_xpath(
                    '//div[@class="pagebtns"]/label')[-1]
                jjjz_next_page.click()
                time.sleep(1)  # let the next page of the table render
            sql = "insert into jjjz(fundId,JZDate,unitJZ,totalJZ,dateRate,buyStatus,saleStatus,red) values (%s,%s,%s,%s,%s,%s,%s,%s)"
            self.Sql.exec_sql(self.db_conn, sql, data)
            sql = 'update crawl_info set crawledFundId = {} where spiderName = "jjjz"'.format(
                fund_id)
            self.Sql.exec_sql(self.db_conn, sql)