Example #1
def cra_main_page():
    print("主页抓取:")
    # 先用测试数据
    # content = testData.str
    content = cra_data_url("http://odds.500.com/yazhi_jczq.shtml")
    # print(soup.select("#main-tbody input[type='checkbox']"))
    selector = etree.HTML(content)

    # Match number (checkbox value)
    elemtsNo = selector.xpath('//tbody[@id="main-tbody"]'
                              '//input[@type="checkbox"]/@value')
    # First column: match session text
    elemts1 = selector.xpath('//tbody[@id="main-tbody"]'
                             '//input[@type="checkbox"]/../text()')
    # Match start time
    elemtsTime = selector.xpath(
        '//tbody[@id="main-tbody"]//input[@type="checkbox"]'
        '/../../../@date-dtime')
    # Hidden 4th column: name of team 1
    elemtsTeam1 = selector.xpath(
        '//tbody[@id="main-tbody"]//input[@type="checkbox"]'
        '/../../following-sibling::*[4]/a/text()')
    # Hidden 6th column: name of team 2
    elemtsTeam2 = selector.xpath(
        '//tbody[@id="main-tbody"]//input[@type="checkbox"]'
        '/../../following-sibling::*[6]/a/text()')
    # elNo, el1, elTime, name1, name2
    # Ready to save to the database
    if elemtsNo:
        exe_result = DBUtils.executeMany(
            "replace into football_game_info values (%s, %s, %s, %s, %s)",
            zip(elemtsNo, elemts1, elemtsTime, elemtsTeam1, elemtsTeam2))
        print("主页保存条数:%s" % exe_result)
    else:
        print("主页抓取失败:%s" % content)
Example #2
def process_sub_page3(
        testElemNo=None,
        first_tmp_url="http://odds.500.com/fenxi/rangqiu-%s.shtml?lot=jczq",
        next_tmp_url="http://odds.500.com/fenxi1/rangqiu.php?id=%s&ctype=1&"
    "start=%d&r=1&style=0&guojia=0&chupan=0&lot=jczq",
        table_name="rangqiu_startvalue_info"):
    startTotalTime = time.time()
    if testElemNo is not None:
        elemNoList = testElemNo
    else:
        elemNoList = DBUtils.execute(
            "select game_no,start_datetime from football_game_info "
            "where start_datetime>now() order by start_datetime ")
    all_sub_data = []
    str_sql = "insert into " + table_name + "(kelly_win_num, kelly_avg_num, " \
                                            "kelly_lost_num,game_id) values(%s,%s,%s,%s) "

    # Set the column index used when parsing
    params = {}
    if table_name != "pay_info":
        params = {"col_index": 2}
    if not isinstance(elemNoList, list):
        print("不是数组:", end="")
        print(elemNoList)
        return []
    """  解析让球指数页面 """
    for elObj in elemNoList:
        startTime = time.time()
        time.sleep(ConstantVal.TIMER_INTERVAL)
        elNo = elObj["game_no"]

        url = first_tmp_url % elNo
        # Data from the first sub-page; there may be more pages
        all_grade_data = analysis_all_sub_page3(cra_data_url(url, "utf-8"),
                                                **params)

        # Data from the remaining sub-pages (paged by the "start" offset)
        start = 30
        while True:
            url = next_tmp_url % (elNo, start)
            htmlData = cra_data_url(url, "utf-8")
            if htmlData and len(htmlData.strip()) > 100:
                result = analysis_all_sub_page3(htmlData, 'tr', **params)
                all_grade_data = concat_array(all_grade_data, result)
                start += 30
            else:
                break

        # Compute the squared deviation (variance) of each column
        if len(all_grade_data) > 0:
            avg_data = [
                str(float('%.3f' % cal_std2(bb))) for bb in all_grade_data
            ]
            avg_data.append(elNo)
            all_sub_data.append(avg_data)
            # count = DBUtils.executeOne(str_sql, avg_data)
        print(str(elNo) + " " + table_name +
              " sub-page rows scraped: %s, elapsed: %s" %
              (start, utils.float_num(time.time() - startTime)))
    count = DBUtils.executeMany(str_sql, all_sub_data)
    print(all_sub_data)
    print("子页面保存数量:%s 耗时:%s" %
          (count, utils.float_num(time.time() - startTotalTime)))
    return elemNoList
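
process_sub_page3 leans on two helpers, cal_std2 and concat_array, that are not shown. Judging from how they are called (column-wise data accumulated across pages, then one variance value per column), they might look roughly like this; the names come from the code above, but the bodies are assumptions:

import statistics


def cal_std2(values):
    """Assumed helper: population variance of one column of odds values."""
    nums = [float(v) for v in values]
    return statistics.pvariance(nums) if nums else 0.0


def concat_array(base, extra):
    """Assumed helper: column-wise concatenation of two page results."""
    if not base:
        return extra
    if not extra:
        return base
    return [list(a) + list(b) for a, b in zip(base, extra)]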
Example #3
def insert_std_data(std_data):
    """ 保存比分的标准差 """
    return DBUtils.executeMany(
        "INSERT INTO bifen_index_std(win_type,m1_0,m2_0,m2_1,m3_0,m3_1,m3_2,"
        "m4_0,m4_1,m4_2,m4_3,m0_0,m1_1,m2_2,m3_3,m4_4,game_id) "
        "VALUES (" + "%s," * 16 + "%s)", std_data)
Example #4
def parse_bifen(game_id=729379):
    time.sleep(ConstantVal.TIMER_INTERVAL)
    startTime = time.time()
    bifen_url = "http://odds.500.com/fenxi/bifen-%s.shtml" % game_id
    bifen_html = cra_data.cra_data_url(bifen_url)
    # Only parse <table> elements whose class is "pub_table"
    only_table = SoupStrainer("table", class_="pub_table")
    soup = BeautifulSoup(bifen_html, "lxml", parse_only=only_table)
    if not soup:
        print("%s 内容无效" % game_id)
        return
    # Get the score headers; find() stops at the first match, use find_all() to get all
    scores_th = soup.find('tr').contents[5:]
    scores = [el.get_text() for el in scores_th if isinstance(el, bs4.element.Tag)]

    # Index of the "0:0" score
    dogfall_index = scores.index("0:0")
    # All bookmaker odds
    # William Hill (威廉希尔): 293, Bwin: 5, Bet365: 3, Eurobet: 15; 1Bet: 671, 金宝博: 348, Macau (澳门): 11, 18Bet: 863
    # Used for ordering; the old list was ["威廉希尔", "Bet365", "Bwin", "澳门"] + ["Eurobet", "1Bet", "金宝博"]
    specify_cids = [293, 5, 3, 15] + [863, 348, 11]
    # Names stay in Chinese because they must match the scraped com_name text
    main_comp = ["威廉希尔", "Bwin", "Bet365", "Eurobet"] + ["18Bet", "金宝博", "澳门"]

    # Find all <tr> rows on the page that have any class attribute
    all_trs = soup.find_all("tr", {"class": re.compile(".*")})

    find_table_bifen_result = []
    main_win_data = []
    next_win_data = []
    col_pay_data = []
    for tr in all_trs:
        # Get the bookmaker id (cid) from the link's query string
        cid = parse_qs(urlparse(tr.a.get('href')).query)['cid']
        # Get the whole row's text as a flat list
        temp_list = list(tr.stripped_strings)

        # Draw odds
        avg_pay_list = temp_list[(dogfall_index + 2) * 2:]
        com_name = temp_list[1]
        order_num = [(main_comp.index(com_name) + 1) * 10 if com_name in main_comp else 100]
        # Home-win odds
        main_win_pay = temp_list[2:(dogfall_index + 2) * 2][::2]
        # Away-win odds
        next_win_pay = temp_list[2:(dogfall_index + 2) * 2][1::2]
        # Home-win row
        temp_list_main = [game_id] + cid + temp_list[:2] + order_num + main_win_pay + avg_pay_list
        # Away-win row
        temp_list_next = [game_id] + cid + temp_list[:2] + order_num + next_win_pay + avg_pay_list

        # Collect the home-win, draw, and away-win odds into one array
        col_pay_data.append(main_win_pay[1:] + avg_pay_list + next_win_pay[1:])

        main_win_data.append(temp_list_main)
        next_win_data.append(temp_list_next)
        # Keep only rows from the specified bookmakers
        if cid and int(cid[0]) in specify_cids:
            find_table_bifen_result.append(temp_list_main)
            find_table_bifen_result.append(temp_list_next)

    # Save the score odds to the database
    if find_table_bifen_result:
        result_count = DBUtils.executeMany(
            "insert into bifen_index(game_id,`cid`,`html_index`,`com_name`,order_num,win_type,`m1_0`,`m2_0`,`m2_1`"
            ",`m3_0`,`m3_1`,"
            "`m3_2`,`m4_0`,`m4_1`,`m4_2`,`m4_3`,`m0_0`,`m1_1`,`m2_2`,`m3_3`,`m4_4`) values (%s,%s,%s,%s,%s,"
            "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", find_table_bifen_result)
        print("%s bifen存库条数:%s 耗时:%s" % (game_id, result_count, utils.float_num(time.time() - startTime)))
    # Compute the standard deviation of the correct-score (bodan) odds and save it to the std table
    if col_pay_data:
        # Compute the bodan squared deviation
        calc_bodan_std_result = bodan_std(col_pay_data)
        result = bifenImpl.insert_bodan_std([game_id] + calc_bodan_std_result)
        print("%s 保存波胆标准差数据条数:%s" % (game_id, result))