import re
import time
from urllib.parse import parse_qs, urlparse

import bs4
from bs4 import BeautifulSoup, SoupStrainer
from lxml import etree

# Project-local modules referenced below but defined elsewhere in this repo:
# DBUtils, ConstantVal, utils, cra_data (cra_data_url), testData,
# analysis_all_sub_page3, cal_std2, concat_array, bodan_std, bifenImpl.


def cra_main_page():
    print("Crawling main page:")
    # Test data, for development:
    # content = testData.str
    content = cra_data_url("http://odds.500.com/yazhi_jczq.shtml")
    # print(soup.select("#main-tbody input[type='checkbox']"))
    selector = etree.HTML(content)
    # Game number
    elemtsNo = selector.xpath('//tbody[@id="main-tbody"]'
                              '//input[@type="checkbox"]/@value')
    # First column: match number
    elemts1 = selector.xpath('//tbody[@id="main-tbody"]'
                             '//input[@type="checkbox"]/../text()')
    # Match time
    elemtsTime = selector.xpath(
        '//tbody[@id="main-tbody"]//input[@type="checkbox"]'
        '/../../../@date-dtime')
    # Hidden 4th column: first team name
    elemtsTeam1 = selector.xpath(
        '//tbody[@id="main-tbody"]//input[@type="checkbox"]'
        '/../../following-sibling::*[4]/a/text()')
    # Hidden 6th column: second team name
    elemtsTeam2 = selector.xpath(
        '//tbody[@id="main-tbody"]//input[@type="checkbox"]'
        '/../../following-sibling::*[6]/a/text()')
    # elNo, el1, elTime, name1, name2 -- ready to save to the database
    if elemtsNo:
        exe_result = DBUtils.executeMany(
            "replace into football_game_info values (%s, %s, %s, %s, %s)",
            list(zip(elemtsNo, elemts1, elemtsTime, elemtsTeam1, elemtsTeam2)))
        print("Main page rows saved: %s" % exe_result)
    else:
        print("Main page crawl failed: %s" % content)
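

# cra_data_url is defined elsewhere in this repo (see the cra_data module).
# As a fallback so this section can be read standalone, here is a minimal
# requests-based sketch; the timeout and default encoding are assumptions,
# not the repo's actual API.
try:
    from cra_data import cra_data_url  # the real helper, if importable
except ImportError:
    import requests

    def cra_data_url(url, encoding="gb2312"):
        """Fetch a page and return its decoded HTML text, or '' on failure."""
        try:
            resp = requests.get(url, timeout=10)
            resp.encoding = encoding
            return resp.text
        except requests.RequestException:
            return ""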
def process_sub_page3(
        testElemNo=None,
        first_tmp_url="http://odds.500.com/fenxi/rangqiu-%s.shtml?lot=jczq",
        next_tmp_url="http://odds.500.com/fenxi1/rangqiu.php?id=%s&ctype=1&"
                     "start=%d&r=1&style=0&guojia=0&chupan=0&lot=jczq",
        table_name="rangqiu_startvalue_info"):
    startTotalTime = time.time()
    if testElemNo is not None:
        elemNoList = testElemNo
    else:
        elemNoList = DBUtils.execute(
            "select game_no,start_datetime from football_game_info "
            "where start_datetime>now() order by start_datetime ")
    all_sub_data = []
    str_sql = ("insert into " + table_name + "(kelly_win_num, kelly_avg_num, "
               "kelly_lost_num,game_id) values(%s,%s,%s,%s) ")
    # Column index used by the parser
    params = {}
    if table_name != "pay_info":
        params = {"col_index": 2}
    if not isinstance(elemNoList, list):
        print("Not a list: ", end="")
        print(elemNoList)
        return []
    # Parse the handicap (rangqiu) odds pages
    for elObj in elemNoList:
        startTime = time.time()
        time.sleep(ConstantVal.TIMER_INTERVAL)
        elNo = elObj["game_no"]
        url = first_tmp_url % elNo
        # First page of the sub-page; more pages may follow
        all_grade_data = analysis_all_sub_page3(cra_data_url(url, "utf-8"),
                                                **params)
        # Remaining pages, fetched 30 rows at a time
        start = 30
        while True:
            url = next_tmp_url % (elNo, start)
            htmlData = cra_data_url(url, "utf-8")
            if htmlData and len(htmlData.strip()) > 100:
                result = analysis_all_sub_page3(htmlData, 'tr', **params)
                all_grade_data = concat_array(all_grade_data, result)
                start += 30
            else:
                break
        # Compute the standard deviation of each column
        if len(all_grade_data) > 0:
            avg_data = [str(float('%.3f' % cal_std2(bb)))
                        for bb in all_grade_data]
            avg_data.append(elNo)
            all_sub_data.append(avg_data)
            # count = DBUtils.executeOne(str_sql, avg_data)
        print(str(elNo) + " " + table_name +
              " sub-page total rows crawled: %s, elapsed: %s"
              % (start, utils.float_num(time.time() - startTime)))
    count = DBUtils.executeMany(str_sql, all_sub_data)
    print(all_sub_data)
    print("Sub-page rows saved: %s, elapsed: %s"
          % (count, utils.float_num(time.time() - startTotalTime)))
    return elemNoList
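

# cal_std2 and concat_array are repo helpers used above but not shown in this
# section. Minimal sketches under the assumption that the data is
# column-oriented (each entry is one Kelly column as a list of numeric
# strings); the underscore-prefixed names mark these as illustrative
# stand-ins, not the repo's actual functions:
import math


def _cal_std2_sketch(values):
    """Population standard deviation of a list of numeric strings (sketch)."""
    nums = [float(v) for v in values]
    mean = sum(nums) / len(nums)
    return math.sqrt(sum((n - mean) ** 2 for n in nums) / len(nums))


def _concat_array_sketch(cols_a, cols_b):
    """Merge two column-oriented result sets by extending each column (sketch)."""
    if not cols_a:
        return cols_b
    return [list(a) + list(b) for a, b in zip(cols_a, cols_b)]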
def insert_std_data(std_data):
    """Save the standard deviations of the score (bifen) odds."""
    return DBUtils.executeMany(
        "INSERT INTO bifen_index_std(win_type,m1_0,m2_0,m2_1,m3_0,m3_1,m3_2"
        ",m4_0,m4_1,m4_2,m4_3,m0_0,m1_1,m2_2,m3_3,m4_4,"
        "game_id) VALUES (" + "%s," * 16 + "%s" + ") ",
        std_data)
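
# Each row passed to insert_std_data must supply 17 values, matching the 17
# placeholders above: win_type, the 15 m*_* score columns, then game_id.
# Illustrative call shape (the values here are made up):
# insert_std_data([("main", 0.12, 0.34, 0.05, 0.41, 0.22, 0.18, 0.09, 0.11,
#                   0.07, 0.03, 0.25, 0.31, 0.14, 0.06, 0.02, 729379)])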
def parse_bifen(game_id=729379):
    time.sleep(ConstantVal.TIMER_INTERVAL)
    startTime = time.time()
    bifen_url = "http://odds.500.com/fenxi/bifen-%s.shtml" % game_id
    bifen_html = cra_data.cra_data_url(bifen_url)
    # Only parse tables whose class is pub_table
    only_table = SoupStrainer("table", class_="pub_table")
    soup = BeautifulSoup(bifen_html, "lxml", parse_only=only_table)
    if not soup:
        print("%s content is invalid" % game_id)
        return
    # Score headers; find() stops at the first match (use find_all for all)
    scores_th = soup.find('tr').contents[5:]
    scores = [text.get_text() for text in scores_th
              if isinstance(text, bs4.element.Tag)]
    # Position of the 0:0 (draw) score
    dogfall_index = scores.index("0:0")
    # Bookmaker ids:
    # William Hill (威廉希尔): 293, Bwin: 5, Bet365: 3, Eurobet: 15,
    # 1Bet: 671, 188Bet (金宝博): 348, Macau (澳门): 11, 18Bet: 863
    # Used for ordering; the old order was
    # ["威廉希尔", "Bet365", "Bwin", "澳门"] + ["Eurobet", "1Bet", "金宝博"]
    specify_cids = [293, 5, 3, 15] + [863, 348, 11]
    main_comp = ["威廉希尔", "Bwin", "Bet365", "Eurobet"] + ["18Bet", "金宝博", "澳门"]
    # Find every tr on the page that has a class attribute
    all_trs = soup.find_all("tr", {"class": re.compile(".*")})
    find_table_bifen_result = []
    main_win_data = []
    next_win_data = []
    col_pay_data = []
    for tr in all_trs:
        # Bookmaker id, taken from the cid query parameter of the row's link
        cid = parse_qs(urlparse(tr.a.get('href')).query)['cid']
        # The whole row of cell texts
        temp_list = list(tr.stripped_strings)
        # Draw payouts
        avg_pay_list = temp_list[(dogfall_index + 2) * 2:]
        com_name = temp_list[1]
        order_num = [(main_comp.index(com_name) + 1) * 10
                     if com_name in main_comp else 100]
        # Home-win odds
        main_win_pay = temp_list[2:(dogfall_index + 2) * 2][::2]
        # Away-win odds
        next_win_pay = temp_list[2:(dogfall_index + 2) * 2][1::2]
        # Home-win row
        temp_list_main = ([game_id] + cid + temp_list[:2] + order_num
                          + main_win_pay + avg_pay_list)
        # Away-win row
        temp_list_next = ([game_id] + cid + temp_list[:2] + order_num
                          + next_win_pay + avg_pay_list)
        # Collect the home-win, draw and away-win odds into one flat row
        col_pay_data.append(main_win_pay[1:] + avg_pay_list + next_win_pay[1:])
        main_win_data.append(temp_list_main)
        next_win_data.append(temp_list_next)
        # if com_name in com_name and cid and int(cid[0]) in specify_cids:
        find_table_bifen_result.append(temp_list_main)
        find_table_bifen_result.append(temp_list_next)
    # Save the score odds to the database
    if find_table_bifen_result:
        result_count = DBUtils.executeMany(
            "insert into bifen_index(game_id,`cid`,`html_index`,`com_name`,"
            "order_num,win_type,`m1_0`,`m2_0`,`m2_1`,`m3_0`,`m3_1`,`m3_2`,"
            "`m4_0`,`m4_1`,`m4_2`,`m4_3`,`m0_0`,`m1_1`,`m2_2`,`m3_3`,`m4_4`)"
            " values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,"
            "%s,%s,%s,%s)",
            find_table_bifen_result)
        print("%s bifen rows saved: %s, elapsed: %s"
              % (game_id, result_count,
                 utils.float_num(time.time() - startTime)))
    # Compute the std of the correct-score (bodan) odds, then save it to the
    # std table
    if col_pay_data:
        calc_bodan_std_result = bodan_std(col_pay_data)
        result = bifenImpl.insert_bodan_std([game_id] + calc_bodan_std_result)
        print("%s bodan std rows saved: %s" % (game_id, result))
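

# bodan_std is used above but defined elsewhere in the repo. A plausible
# sketch, assuming it returns the standard deviation of each correct-score
# column across all bookmaker rows; the underscore-prefixed name is an
# illustrative stand-in, not the repo's function:
import statistics


def _bodan_std_sketch(col_pay_data):
    """Std of each correct-score column over all bookmaker rows (sketch)."""
    columns = zip(*col_pay_data)  # transpose rows of odds into score columns
    return [round(statistics.pstdev(float(v) for v in col), 3)
            for col in columns]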