예제 #1
0
파일: race.py 프로젝트: qemtek/rpscrape
    def get_race_distances(self):
        """Return the race distance in display, yard, furlong and metre form.

        The short distance text and the bracketed full distance text are read
        from ``self.doc`` and converted with ``self.distance_to_furlongs`` and
        ``self.distance_to_metres``.

        Returns:
            A 4-tuple ``(dist, dist_y, dist_f, dist_m)``:

            * ``dist``   - the distance text as found on the page
            * ``dist_y`` - ``dist_m * 1.0936`` rounded (yards)
            * ``dist_f`` - furlongs as a string with an ``'f'`` suffix
            * ``dist_m`` - metres

        If the furlong conversion raises ``ValueError`` the failure is printed
        together with the race URL and ``-1`` is used as the furlong value.
        """
        dist = find(self.doc, 'span', 'block-distanceInd')
        dist_y = find(self.doc, 'span', 'block-fullDistanceInd').strip('()')

        try:
            dist_f = self.distance_to_furlongs(dist)
        except ValueError as err:
            # Report the actual exception and both raw inputs so the failing
            # page can be diagnosed from the log line alone.
            print(
                f'ERROR: distance_to_furlongs(). Error: {err}, dist: {dist}, dist_y: {dist_y}'
            )
            print('Race: ', self.url)
            dist_f = -1

        dist_m = self.distance_to_metres(dist_y)

        # No usable metre figure: derive it from furlongs instead.
        if dist_m == 0:
            dist_m = round(dist_f * 201.168)

        dist_y = round(dist_m * 1.0936)

        # Drop only a trailing '.0' (10.0 -> '10'); a blanket replace would
        # also corrupt values such as 10.05.
        furlongs = str(dist_f)
        if furlongs.endswith('.0'):
            furlongs = furlongs[:-2]
        dist_f = furlongs + 'f'

        # For regions outside this set the metre value is recomputed as
        # furlongs * 200.
        if self.race_info['region'] not in {'GB', 'IRE', 'USA', 'CAN'}:
            dist_m = float(dist_f.strip('f')) * 200

        return dist, dist_y, dist_f, dist_m
예제 #2
0
파일: racecards.py 프로젝트: 4A47/rpscrape
def get_race_type(doc, race, distance):
    """Classify a racecard as 'Hurdle', 'Chase', 'NH Flat' or 'Flat'.

    Args:
        doc: Parsed racecard page, queried through ``find``.
        race: Race name; callers pass it lower-cased so the keyword
            matching below is effectively case-insensitive.
        distance: Race distance, compared against 12.

    Returns:
        The race type string. The header box text decides first; when it
        mentions neither hurdles nor fences, and the distance is at least 12,
        keywords in the race name decide. Anything else is 'Flat'.
    """
    obstacle_text = find(doc, 'div', 'RC-headerBox__stalls').lower()

    if 'hurdle' in obstacle_text:
        return 'Hurdle'
    if 'fence' in obstacle_text:
        return 'Chase'

    if distance >= 12:
        chase_terms = (' chase', '(chase)', 'steeplechase', 'steeple-chase',
                       'steeplchase', 'steepl-chase')
        hurdle_terms = (' hurdle', '(hurdle)')
        nh_flat_terms = ('national hunt flat', 'nh flat race',
                         'mares flat race', 'inh bumper', ' sales bumper',
                         'kepak flat race', 'i.n.h. flat race')

        def name_has(terms):
            return any(term in race for term in terms)

        # Checked from highest to lowest precedence: a chase keyword beats a
        # hurdle keyword, which beats an NH Flat keyword.
        if name_has(chase_terms):
            return 'Chase'
        if name_has(hurdle_terms):
            return 'Hurdle'
        if name_has(nh_flat_terms):
            return 'NH Flat'

    return 'Flat'
예제 #3
0
파일: race.py 프로젝트: 4A47/rpscrape
    def get_num_runners(self):
        """Return the runner-count text with the word 'ran' removed.

        Returns ``None`` when ``find`` yields ``None`` for the race-info span.
        """
        runners_text = find(self.doc, 'span',
                            'rp-raceInfo__value rp-raceInfo__value_black')

        if runners_text is None:
            return None

        without_label = runners_text.replace('ran', '')
        return without_label.strip()
예제 #4
0
파일: race.py 프로젝트: 4A47/rpscrape
    def get_course(self, course_url):
        """Return the course name for this page.

        The ``h1`` course header is used when ``find`` returns something
        other than an empty string. Otherwise the first course-name link
        text is used, and if there is no such link the title-cased
        ``course_url`` is returned.
        """
        header_name = find(self.doc, 'h1', 'RC-courseHeader__name')
        if header_name != '':
            return header_name

        try:
            link_texts = self.doc.xpath(
                "//a[contains(@class, 'rp-raceTimeCourseName__name')]/text()"
            )
            return link_texts[0].strip()
        except IndexError:
            # No matching link on the page: fall back to the URL segment.
            return course_url.title()
예제 #5
0
파일: race.py 프로젝트: 4A47/rpscrape
    def get_race_type(self):
        """Classify the race as 'Flat', 'Hurdle', 'Chase' or 'NH Flat'.

        Uses ``race_info['code']``, ``race_info['race_name']`` and, when the
        page text is inconclusive, ``race_info['dist_m']`` together with
        keywords in the race name.
        """
        name = self.race_info['race_name'].lower()

        is_flat_code = self.race_info['code'] == 'flat'
        if is_flat_code and 'national hunt flat' not in name:
            return 'Flat'

        obstacle_text = find(self.doc, 'span',
                             'rp-raceTimeCourseName_hurdles').lower()
        if 'hurdle' in obstacle_text:
            return 'Hurdle'
        if 'fence' in obstacle_text:
            return 'Chase'

        if self.race_info['dist_m'] >= 2400:
            chase_terms = (' chase', '(chase)', 'steeplechase',
                           'steeple-chase', 'steeplchase', 'steepl-chase')
            hurdle_terms = (' hurdle', '(hurdle)')
            nh_flat_terms = ('national hunt flat', 'nh flat race',
                             'mares flat race', 'inh bumper', ' sales bumper',
                             'kepak flat race', 'i.n.h. flat race')

            def name_has(terms):
                return any(term in name for term in terms)

            # Highest precedence first: chase, then hurdle, then NH Flat.
            if name_has(chase_terms):
                return 'Chase'
            if name_has(hurdle_terms):
                return 'Hurdle'
            if name_has(nh_flat_terms):
                return 'NH Flat'

        return 'Flat'
예제 #6
0
파일: race.py 프로젝트: 4A47/rpscrape
    def parse_race_bands(self):
        """Split the rating-band / ages-allowed text into its two parts.

        The span text is stripped of surrounding parentheses and split on
        commas. A part containing ``'yo'`` is taken as the age band; otherwise
        a part containing ``'-'`` is taken as the rating band. When several
        parts match the same kind, the last one wins.

        Returns:
            ``(band_age, band_rating)``; each is ``''`` when not present.
            Both values have surrounding whitespace and parentheses removed.
        """
        band = find(self.doc,
                    'span',
                    'rp-raceTimeCourseName_ratingBandAndAgesAllowed',
                    property='class')

        band_age = ''
        band_rating = ''

        for part in band.strip('()').split(','):
            # Clean every part the same way so the rating band never keeps a
            # stray parenthesis that the age band would have had removed.
            part = part.strip().strip('()').strip()

            if 'yo' in part:
                band_age = part
            elif '-' in part:
                band_rating = part

        return band_age, band_rating
예제 #7
0
파일: race.py 프로젝트: 4A47/rpscrape
    def __init__(self, url, document, code, fields):
        """Populate race-level and runner-level data for one result page.

        Args:
            url: Race URL. It is split on '/' and the segments at indices
                4, 5, 6 and 7 are used as course id, course name fallback,
                date and race id respectively.
            document: Parsed page, queried through the ``find`` / ``xpath``
                helpers and the ``get_*`` methods.
            code: Stored as ``race_info['code']``.
            fields: Passed to ``create_csv_data`` to build ``csv_data``.

        The statements below are order-dependent: several helper methods read
        ``race_info`` / ``runner_info`` entries that earlier lines set.
        """
        self.url = url
        self.doc = document
        self.race_info = {}
        self.runner_info = {}

        url_split = self.url.split('/')

        # Identification taken from the URL segments.
        self.race_info['code'] = code
        self.race_info['date'] = convert_date(url_split[6])
        self.race_info['course'] = self.get_course(url_split[5])
        self.race_info['course_id'] = url_split[4]
        self.race_info['region'] = get_region(url_split[4])
        self.race_info['race_id'] = url_split[7]

        # Race header details scraped from the page.
        self.race_info['going'] = find(self.doc,
                                       'span',
                                       'rp-raceTimeCourseName_condition',
                                       property='class')
        self.race_info['surface'] = get_surface(self.race_info['going'])
        self.race_info['off'] = find(self.doc, 'span', 'text-raceTime')
        self.race_info['race_name'] = find(self.doc,
                                           'h2',
                                           'rp-raceTimeCourseName__title',
                                           property='class')
        self.race_info['class'] = find(self.doc,
                                       'span',
                                       'rp-raceTimeCourseName_class',
                                       property='class').strip('()')
        self.race_info['race_name'] = self.clean(self.race_info['race_name'])

        # No class on the page: ask get_race_class() for one.
        if self.race_info['class'] == '':
            self.race_info['class'] = self.get_race_class()

        # get_race_pattern() runs before clean_race_name() rewrites the name.
        self.race_info['pattern'] = self.get_race_pattern()
        self.race_info['race_name'] = self.clean_race_name(
            self.race_info['race_name'])
        self.race_info['age_band'], self.race_info[
            'rating_band'] = self.parse_race_bands()

        # Still no class but a rating band is known: derive class from it.
        if self.race_info[
                'class'] == '' and self.race_info['rating_band'] != '':
            self.race_info['class'] = self.get_class_from_rating()

        self.race_info['sex_rest'] = self.sex_restricted()
        self.race_info['dist'], self.race_info['dist_y'],\
        self.race_info['dist_f'], self.race_info['dist_m'] = self.get_race_distances()
        self.race_info['type'] = self.get_race_type()
        self.race_info['ran'] = self.get_num_runners()

        # Pedigree data is built from the cells of the pedigree rows.
        pedigree = Pedigree(
            xpath(self.doc, 'tr', 'block-pedigreeInfoFullResults', fn='/td'))

        self.runner_info['sire_id'] = pedigree.id_sires
        self.runner_info['sire'] = pedigree.sires
        self.runner_info['dam_id'] = pedigree.id_dams
        self.runner_info['dam'] = pedigree.dams
        self.runner_info['damsire_id'] = pedigree.id_damsires
        self.runner_info['damsire'] = pedigree.damsires
        self.runner_info['sex'] = self.get_sexs(pedigree.pedigrees)
        self.runner_info['comment'] = self.get_comments()
        self.runner_info['pos'] = self.get_positions()
        self.runner_info['prize'] = self.get_prizemoney()
        self.runner_info['draw'] = self.get_draws()
        self.runner_info['ovr_btn'], self.runner_info[
            'btn'] = self.get_distance_btn()
        self.runner_info['sp'] = self.get_starting_prices()
        self.runner_info['dec'] = self.get_decimal_odds()
        self.runner_info['num'] = self.get_numbers()

        # Fall back to counting the runner numbers when the page gave no
        # runner count; otherwise convert the scraped text to an int.
        if not self.race_info['ran']:
            self.race_info['ran'] = len(self.runner_info['num'])
        else:
            self.race_info['ran'] = int(self.race_info['ran'])

        self.runner_info['age'] = self.get_horse_ages()
        self.runner_info['horse'] = self.get_names_horse()
        self.runner_info['horse_id'] = self.get_ids_horse()
        self.runner_info['jockey'] = self.get_names_jockey()
        self.runner_info['jockey_id'] = self.get_ids_jockey()
        self.runner_info['trainer'] = self.get_names_trainer()
        self.runner_info['trainer_id'] = self.get_ids_trainer()
        self.runner_info['owner'] = self.get_names_owner()
        self.runner_info['owner_id'] = self.get_ids_owner()
        self.runner_info['hg'] = self.get_headgear()

        self.runner_info['wgt'], self.runner_info['lbs'] = self.get_weights()
        # Rating columns are selected by their 'data-ending' attribute value.
        self.runner_info['or'] = xpath(self.doc,
                                       'td',
                                       'OR',
                                       'data-ending',
                                       fn='/text()')
        self.runner_info['rpr'] = xpath(self.doc,
                                        'td',
                                        'RPR',
                                        'data-ending',
                                        fn='/text()')
        self.runner_info['ts'] = xpath(self.doc,
                                       'td',
                                       'TS',
                                       'data-ending',
                                       fn='/text()')
        self.runner_info['silk_url'] = xpath(self.doc,
                                             'img',
                                             'rp-horseTable__silk',
                                             'class',
                                             fn='/@src')

        self.runner_info['time'] = self.get_finishing_times()
        self.runner_info['secs'] = self.time_to_seconds(
            self.runner_info['time'])

        self.clean_non_completions()

        self.csv_data = self.create_csv_data(fields)
예제 #8
0
파일: racecards.py 프로젝트: 4A47/rpscrape
def _find_int(element, tag, value, attrib):
    """Return the attribute located by ``find`` as an int, or None on ValueError."""
    try:
        return int(find(element, tag, value, attrib=attrib))
    except ValueError:
        return None


def parse_races(session, race_urls, date):
    """Download and parse every racecard in ``race_urls``.

    Args:
        session: HTTP session; used for ``session.get`` and passed on to
            ``get_going_info`` and ``get_runners``.
        race_urls: Iterable of racecard URLs. Each URL is split on '/' and
            the segments at indices 4, 6 and 7 are read as course id, date
            and race id.
        date: Passed to ``get_going_info``.

    Returns:
        A nested ``defaultdict`` indexed as
        ``races[region][course][off_time]`` whose values are the per-race
        dicts built below, each with a ``'runners'`` list.
    """
    races = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

    going_info = get_going_info(session, date)

    for url in race_urls:
        r = session.get(url, headers=random_header.header())
        doc = html.fromstring(r.content)

        race = {}

        url_split = url.split('/')

        race['race_id'] = int(url_split[7])
        race['date'] = url_split[6]
        race['course_id'] = int(url_split[4])
        race['course'] = find(doc, 'h1', 'RC-courseHeader__name')
        race['off_time'] = find(doc, 'span', 'RC-courseHeader__time')
        race['race_name'] = find(doc, 'span', 'RC-header__raceInstanceTitle')
        race['distance_round'] = find(doc, 'strong',
                                      'RC-header__raceDistanceRound')
        race['distance'] = find(doc, 'span', 'RC-header__raceDistance')
        # Use the rounded distance when no exact distance is shown.
        race['distance'] = race['distance_round'] if not race[
            'distance'] else race['distance'].strip('()')
        race['distance_f'] = distance_to_furlongs(race['distance_round'])
        race['region'] = get_region(str(race['course_id']))
        race['pattern'] = get_pattern(race['race_name'].lower())
        race['race_class'] = find(doc, 'span', 'RC-header__raceClass')
        race['race_class'] = race['race_class'].strip(
            '()') if race['race_class'] else ''
        race['type'] = get_race_type(doc, race['race_name'].lower(),
                                     race['distance_f'])

        # A race with a pattern but no explicit class is recorded as Class 1.
        if not race['race_class'] and race['pattern']:
            race['race_class'] = 'Class 1'

        try:
            band = find(doc, 'span', 'RC-header__rpAges').strip('()').split()
            if band:
                race['age_band'] = band[0]
                race['rating_band'] = band[1] if len(band) > 1 else None
            else:
                race['age_band'] = None
                race['rating_band'] = None
        except AttributeError:
            race['age_band'] = None
            race['rating_band'] = None

        prize = find(doc, 'div', 'RC-headerBox__winner').lower()
        race['prize'] = prize.split(
            'winner:')[1].strip() if 'winner:' in prize else None
        field_size = find(doc, 'div', 'RC-headerBox__runners').lower()
        if field_size:
            race['field_size'] = int(
                field_size.split('runners:')[1].split('(')[0].strip())
        else:
            race['field_size'] = ''

        try:
            course_going = going_info[race['course_id']]
            race['going_detailed'] = course_going['going']
            race['rail_movements'] = course_going['rail_movements']
            race['stalls'] = course_going['stalls']
            race['weather'] = course_going['weather']
        except KeyError:
            # Reset the same four keys the try block sets, so that
            # 'going_detailed' is always present on the race dict.
            race['going_detailed'] = None
            race['rail_movements'] = None
            race['stalls'] = None
            race['weather'] = None

        going = find(doc, 'div', 'RC-headerBox__going').lower()
        race['going'] = going.split(
            'going:')[1].strip().title() if 'going:' in going else ''

        race['surface'] = get_surface(race['going'])

        profile_hrefs = doc.xpath(
            "//a[@data-test-selector='RC-cardPage-runnerName']/@href")
        profile_urls = [
            'https://www.racingpost.com' + a.split('#')[0] + '/form'
            for a in profile_hrefs
        ]

        runners = get_runners(session, profile_urls)

        for horse in doc.xpath("//div[contains(@class, ' js-PC-runnerRow')]"):
            horse_id = int(
                find(horse, 'a', 'RC-cardPage-runnerName',
                     attrib='href').split('/')[3])
            runner = runners[horse_id]

            # When the runner entry is flagged 'broken_url', fill in pedigree,
            # age, colour, sex and trainer from the racecard row itself.
            if 'broken_url' in runner:
                sire = find(horse, 'a', 'RC-pedigree__sire').split('(')
                dam = find(horse, 'a', 'RC-pedigree__dam').split('(')
                damsire = find(
                    horse, 'a',
                    'RC-pedigree__damsire').lstrip('(').rstrip(')').split('(')

                runner['sire'] = clean_name(sire[0])
                runner['dam'] = clean_name(dam[0])
                runner['damsire'] = clean_name(damsire[0])

                runner['sire_region'] = sire[1].replace(')', '').strip()
                runner['dam_region'] = dam[1].replace(')', '').strip()
                runner['damsire_region'] = damsire[1].replace(')', '').strip()

                runner['age'] = find(horse,
                                     'span',
                                     'RC-cardPage-runnerAge',
                                     attrib='data-order-age')

                sex = find(horse, 'span', 'RC-pedigree__color-sex').split()

                runner['colour'] = sex[0]
                runner['sex_code'] = sex[1].capitalize()

                runner['trainer'] = find(horse,
                                         'a',
                                         'RC-cardPage-runnerTrainer-name',
                                         attrib='data-order-trainer')

            # The runner number is required: a non-numeric value still raises.
            runner['number'] = int(
                find(horse,
                     'span',
                     'RC-cardPage-runnerNumber-no',
                     attrib='data-order-no'))

            # Optional numeric fields become None when not numeric.
            runner['draw'] = _find_int(horse, 'span',
                                       'RC-cardPage-runnerNumber-draw',
                                       'data-order-draw')

            runner['headgear'] = find(horse, 'span',
                                      'RC-cardPage-runnerHeadGear')
            runner['headgear_first'] = find(
                horse, 'span', 'RC-cardPage-runnerHeadGear-first')

            runner['lbs'] = _find_int(horse, 'span',
                                      'RC-cardPage-runnerWgt-carried',
                                      'data-order-wgt')
            runner['ofr'] = _find_int(horse, 'span', 'RC-cardPage-runnerOr',
                                      'data-order-or')
            runner['rpr'] = _find_int(horse, 'span', 'RC-cardPage-runnerRpr',
                                      'data-order-rpr')
            runner['ts'] = _find_int(horse, 'span', 'RC-cardPage-runnerTs',
                                     'data-order-ts')

            claim = find(horse, 'span', 'RC-cardPage-runnerJockey-allowance')
            jockey = find(horse,
                          'a',
                          'RC-cardPage-runnerJockey-name',
                          attrib='data-order-jockey')

            if jockey:
                # Append the claim allowance in brackets when there is one.
                runner['jockey'] = jockey if not claim else jockey + f'({claim})'
            else:
                runner['jockey'] = None

            try:
                runner['last_run'] = find(horse, 'div',
                                          'RC-cardPage-runnerStats-lastRun')
            except TypeError:
                runner['last_run'] = None

            runner['form'] = find(horse, 'span', 'RC-cardPage-runnerForm')

            try:
                runner['trainer_rtf'] = find(horse, 'span',
                                             'RC-cardPage-runnerTrainer-rtf')
            except TypeError:
                runner['trainer_rtf'] = None

        race['runners'] = list(runners.values())
        races[race['region']][race['course']][race['off_time']] = race

    return races