All examples below appear to come from the same netkeiba.com scraping project and assume roughly the following imports. getPage, getHorseRaceResults, getJockeyResult, getHorseProfile, getMareCrops, getHorseAdditionalInfo, getMareCropsResult, getHorseIdByName, getHorseIdByName2 and getBreederId, as well as the mare_conv_tbl and bms_conv_tbl lookup tables, are project-local helpers that are not shown in these excerpts.

import os
import pickle
from argparse import ArgumentParser
from time import sleep

import pandas as pd
from tqdm import tqdm

Example #1
def main():
    parser = ArgumentParser()
    parser.add_argument('--output_dir',
                        action='store',
                        type=str,
                        default='.',
                        help='output directory')
    parser.add_argument('--ids',
                        action='store',
                        nargs='+',
                        type=str,
                        required=True,
                        help='target id list')
    args = parser.parse_args()

    output_dirname = args.output_dir
    target_ids = args.ids

    for target_id in tqdm(target_ids):
        url = "https://db.netkeiba.com/horse/{}/".format(target_id)
        html = getPage(url)
        result = getHorseRaceResults(html)
        sleep(0.2)  # throttle requests to be polite to the server

        output_filename = "horse_race_result_{}.csv".format(target_id)
        output_filename = os.path.join(output_dirname, output_filename)
        df_out = pd.DataFrame(
            result, columns=["date", "name", "place", "prize", "weight"])
        df_out.to_csv(output_filename, index=False)
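
None of the excerpts include getPage itself. A minimal sketch of what such a helper might look like, assuming it fetches the URL with requests and returns the decoded HTML; the function name and contract come from the call sites above, everything else is an assumption:

import requests

def getPage(url):
    # hypothetical implementation: fetch the page and return its HTML text
    response = requests.get(url)
    response.raise_for_status()
    # db.netkeiba.com does not serve UTF-8, so let requests guess the
    # encoding from the body rather than the headers (assumption)
    response.encoding = response.apparent_encoding
    return response.text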
Example #2
def main():
    parser = ArgumentParser()
    parser.add_argument('--year',
                        '-y',
                        action='store',
                        type=int,
                        required=True,
                        help='target year')
    parser.add_argument('-n',
                        action='store',
                        type=int,
                        default=None,
                        help='max sample')
    parser.add_argument('--output',
                        '-o',
                        action='store',
                        type=str,
                        default='jockey_leading.csv',
                        help='output filename')
    args = parser.parse_args()

    year = args.year
    max_sample = args.n
    output_filename = args.output
    results = []

    page = 1
    while True:
        html = getPage(
            "http://db.netkeiba.com/?pid=jockey_leading&year=%d&page=%d" %
            (year, page))
        # offset=len(results) presumably lets the parser keep a running
        # rank across pages
        page_result = getJockeyResult(html, offset=len(results))
        print('got {} rows'.format(len(page_result)))
        results += page_result

        if len(page_result) == 0:
            break

        if max_sample and len(results) >= max_sample:
            results = results[:max_sample]
            break

        page += 1
        sleep(1)

    print(len(results))
    df = pd.DataFrame(results)
    reordered_cols = [
        'id', 'name', 'stable', 'win_count', 'second_place_count',
        'third_place_count', 'unplaced_count', 'grade_race_count',
        'grade_race_win', 'stakes_race_count', 'stakes_race_win',
        'general_race_count', 'general_race_win', 'turf_race_count',
        'turf_race_win', 'dart_race_count', 'dart_race_win', 'win_ratio',
        'in_second_place_ratio', 'in_third_place_ratio', 'prize'
    ]
    df = df[reordered_cols]
    df.to_csv(output_filename)
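
Given the flags defined above, a typical run might look like python jockey_leading.py --year 2019 -n 100 (the script filename is hypothetical). Note that, unlike Examples #1 and #3, to_csv is called here without index=False, so the DataFrame index is written to the file as an extra unnamed column.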
Example #3
def main():
    parser = ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        action='store',
                        type=str,
                        default=None,
                        help='horse data csv')
    parser.add_argument('--output',
                        '-o',
                        action='store',
                        type=str,
                        default='horse_data.additional.csv',
                        help='output filename')
    # nargs='*' rather than '+' so the ids can be omitted when --input is given
    parser.add_argument('ids', type=str, nargs='*', help='additional target ids')
    args = parser.parse_args()

    input_filename = args.input
    output_filename = args.output

    ids = []
    if input_filename:
        df = pd.read_csv(input_filename)
        ids.extend(df['id'].values)
    ids.extend(args.ids)

    results = []
    for horse_id in tqdm(ids):
        url = "http://db.netkeiba.com/horse/{id}/".format(id=horse_id)
        html = getPage(url)
        result = getHorseProfile(html)
        results.append(result)
        sleep(0.2)

    cols = [
        'id', 'name', 'sire', 'sire_id', 'mare', 'mare_id', 'bms', 'bms_id',
        'hair', 'sex', 'birth_date', 'trainer', 'trainer_id', 'owner',
        'owner_id', 'breeder', 'breeder_id', 'prize', 'race_result',
        'debut_weight', 'sales_price', 'relatives', 'maruchi', 'kakuchi'
    ]
    df_out = pd.DataFrame(results, columns=cols)
    df_out.to_csv(output_filename, index=False)
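
Target ids for this script can come from the id column of a CSV passed via --input, from the positional arguments, or from both combined, e.g. python <script> -i horses.csv 2014102565 (the script and CSV names are hypothetical; the id is one of the test ids that appears in the __main__ blocks further down).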
Example #4
def main():
    parser = ArgumentParser()
    parser.add_argument('--output_dir', action='store', type=str, default='.', help='output directory')
    parser.add_argument('--ids', action='store', nargs='+', type=str, required=True, help='target id list')
    args = parser.parse_args()

    output_dirname = args.output_dir
    mare_ids = args.ids

    for mare_id in tqdm(mare_ids):
        url = "https://db.netkeiba.com/horse/mare/{}/".format(mare_id)
        html = getPage(url)
        result = getMareCrops(html)
        sleep(0.2)

        output_filename = "mare_crop_{}.csv".format(mare_id)
        output_filename = os.path.join(output_dirname, output_filename)
        df_out = pd.DataFrame(result, columns=["year", "name", "horse_id", "sex", "sire"])
        df_out.to_csv(output_filename, index=False)
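
The per-crop rows that getMareCrops returns, with the year, name, horse_id, sex and sire fields selected above, can be seen being assembled in the function tail shown in Example #7 below.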
Example #5
def main():
    parser = ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        action='store',
                        type=str,
                        default='horse_ranking.csv',
                        help='horse data csv')
    parser.add_argument('--output',
                        '-o',
                        action='store',
                        type=str,
                        default='horse_data.additional.csv',
                        help='output filename')
    args = parser.parse_args()

    input_filename = args.input
    output_filename = args.output

    df = pd.read_csv(input_filename)
    ids = df['id'].values

    results = []
    for horse_id in tqdm(ids):
        url = "http://db.netkeiba.com/horse/{id}/".format(id=horse_id)
        html = getPage(url)
        result = getHorseAdditionalInfo(html)
        results.append(result)
        sleep(0.2)

    df_out = pd.DataFrame(results)

    reordered_cols = [
        'id', 'name', 'hair', 'birth_date', 'race_result', 'debut_weight',
        'sales_price', 'relatives'
    ]
    df_out = df_out[reordered_cols]
    df_out.to_csv(output_filename)
Example #6
def main():
    parser = ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        action='store',
                        type=str,
                        default='horse_ranking.csv',
                        help='horse data csv')
    parser.add_argument('--output',
                        '-o',
                        action='store',
                        type=str,
                        default='mare_crop_data.csv',
                        help='output filename')
    parser.add_argument('--cache',
                        action='store',
                        type=str,
                        default='horse_id.pkl',
                        help='horse_id cache file')
    args = parser.parse_args()

    input_filename = args.input
    output_filename = args.output
    cache_filename = args.cache

    df = pd.read_csv(input_filename)
    mare_names = df['mare'].values
    bms_names = df['bms'].values

    if os.path.exists(cache_filename):
        # load horse_id cache
        with open(cache_filename, 'rb') as f:
            horse_ids = pickle.load(f)
    else:
        horse_ids = {}

    results = []
    for mare, bms in tqdm(zip(mare_names, bms_names), total=len(mare_names)):
        if mare in horse_ids:
            horse_id = horse_ids[mare]
        else:
            if mare in mare_conv_tbl:
                mare = mare_conv_tbl[mare]

            horse_id = getHorseIdByName(mare, sire=bms, sex=[2])
            if not horse_id:
                # retry in partial-match mode
                horse_id = getHorseIdByName2(mare, sire=bms, sex=[2])

            if not horse_id:
                if bms in bms_conv_tbl:
                    horse_id = getHorseIdByName2(mare,
                                                 sire=bms_conv_tbl[bms],
                                                 sex=[2])

            if not horse_id:
                # last resort: try without the sire parameter
                horse_id = getHorseIdByName(mare, sex=[2])
                if horse_id:
                    print("WARNING: horse_id was found without the sire "
                          "parameter ({})".format(mare))

            if not horse_id:
                print("WARNING: horse_id is not found ({})".format(mare))
                continue

            horse_ids[mare] = horse_id

            # update cache
            with open(cache_filename, 'wb') as f:
                pickle.dump(horse_ids, f)

        # skip if we already checked
        if mare in [res['name'] for res in results]:
            continue

        url = "https://db.netkeiba.com/horse/{id}/".format(id=horse_id)
        html = getPage(url)
        result = getMareCropsResult(html)
        results.append(result)
        sleep(0.2)

    df_out = pd.DataFrame(results)
    reordered_cols = [
        'id', 'name', 'birth_date', 'race_result', 'crop_count',
        'crop_win_count', 'crop_grade_horse_count', 'crop_grade_win_count'
    ]
    df_out = df_out[reordered_cols]
    df_out.to_csv(output_filename)
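
The pickle cache is rewritten after every newly resolved name rather than once at the end of the run, so the name-to-id mappings already resolved survive an interruption; since a single unresolved name can cost up to four network lookups, this makes restarts considerably cheaper.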
Example #7
    # (tail of getMareCrops; the beginning of the function is not part of
    # this excerpt)
        crops.append({
            'year': year,
            'name': name,
            'horse_id': horse_id,
            'sex': sex,
            'sire': sire
        })
    return crops[::-1]


if __name__ == "__main__":
    # html = getPage("https://db.netkeiba.com/horse/2014102565/")
    # html = getPage("https://db.netkeiba.com/horse/2015102894/")
    # html = getPage("https://db.netkeiba.com/horse/2014106083/")
    # html = getPage("https://db.netkeiba.com/horse/2016100893/")
    # html = getPage("https://db.netkeiba.com/horse/2016103387/")
    # html = getPage("https://db.netkeiba.com/horse/2016104532/")
    html = getPage("https://db.netkeiba.com/horse/2001100925/")
    result = getHorseProfile(html)

    # html = getPage("https://db.netkeiba.com/horse/2004104258/")
    # html = getPage("https://db.netkeiba.com/horse/1992108561/")
    # html = getPage("https://db.netkeiba.com/horse/2000106445/")
    # html = getPage("https://db.netkeiba.com/horse/2004102429/")
    # html = getPage("https://db.netkeiba.com/horse/000a013c70")
    # html = getPage("https://db.netkeiba.com/horse/000a011df8/")
    # result = getMareCropsResult(html)

    # result = getHorseIdByName('オルフェーヴル')
    # result = getHorseIdByName('スティンガー')
    # result = getHorseIdByName('トリプレックス')
    # result = getHorseIdByName('ラッキーライラック', sex=[2])
Example #8
    # (tail of getBreederId; the code that locates `link` is not part of
    # this excerpt)
    breeder_id = link.get("href").split('/')[-2]
    # the link title has the form "<name>の近走成績" ("recent results of
    # <name>"), so strip the suffix to recover the breeder name
    name = link.get("title").replace('の近走成績', '')

    result = {'alt_id': breeder_id, 'name': name}
    return result


if __name__ == "__main__":
    # html = getPage("http://db.netkeiba.com/horse/2014102565/")
    # html = getPage("http://db.netkeiba.com/horse/2014106083/")
    # result = getHorseAdditionalInfo(html)

    # html = getPage("http://db.netkeiba.com/horse/2004104258/")
    # html = getPage("http://db.netkeiba.com/horse/1992108561/")
    # html = getPage("http://db.netkeiba.com/horse/2000106445/")
    # html = getPage("http://db.netkeiba.com/horse/2004102429/")
    # result = getMareCropsResult(html)

    # result = getHorseIdByName('オルフェーヴル')
    # result = getHorseIdByName('スティンガー')
    # result = getHorseIdByName('トリプレックス')
    # result = getHorseIdByName('ラッキーライラック', sex=[2])

    # result = getHorseIdByName2('ベラドーラII')
    # result = getHorseIdByName2('Debit Or Credit')

    html = getPage("http://db.netkeiba.com/breeder/373126/")
    result = getBreederId(html)

    print(result)