Example #1
0
import sys
from functions import tournaments, array2csv

# Command line input
# Command-line input: first and second args are the start and end years.
start_year = str(sys.argv[1])
end_year = str(sys.argv[2])

# Scrape tournament data for every year in the requested range.

for header_line in ('', 'Year    Tournaments', '----    -----------'):
    print(header_line)

tourney_data = []
for yr in range(int(start_year), int(end_year) + 1):
    year = str(yr)
    tourney_data += tournaments(year)

# Write the combined results to a single CSV file.
filename = 'tournaments_%s-%s' % (start_year, end_year)
array2csv(tourney_data, filename)
from functions import html_parse_tree, xpath_parse, regex_strip_array, array2csv

# Scrape the list of ranking weeks from the ATP singles rankings page.
weeks_url = "http://www.atpworldtour.com/en/rankings/singles"
weeks_xpath = "//ul[@data-value = 'rankDate']/li/@data-value"

weeks_tree = html_parse_tree(weeks_url)
weeks_parsed = xpath_parse(weeks_tree, weeks_xpath)
weeks_cleaned = regex_strip_array(weeks_parsed)

# Wrap each week string in its own single-column row for CSV output.
weeks_list = []
for wk in weeks_cleaned:
    weeks_list.append([wk])

# Output to CSV
filename = 'weeks'
array2csv(weeks_list, filename)
Example #3
0
    # Match stats URL XPath
    # Collect the href of every score cell in the tournament day-table.
    # NOTE(review): the chained assignment also rebinds tourney_match_count_xpath
    # to the same selector — looks like a leftover; confirm it is intentional.
    match_stats_url_xpath = tourney_match_count_xpath = "//table[contains(@class, 'day-table')]/tbody[*]/tr[*]/td[contains(@class, 'day-table-score')]/a/@href"
    match_stats_url_cleaned = xpath_parse(tourney_tree, match_stats_url_xpath)

    # Filter problematic URL's
    # Keep only suffixes that do NOT contain '//'; the rest are dropped.
    match_stats_url_suffixes = []
    for foo in match_stats_url_cleaned:
        if foo.find('//') == -1:
            match_stats_url_suffixes.append(foo)

    # STEP 2: Parse match stats
    if len(match_stats_url_suffixes) > 0:

        # Parse match stats asynchronously
        match_stats_data_scrape += asynchronous(match_stats_url_suffixes,
                                                scrape_match_stats,
                                                tourney_index, tourney_slug)

        # Parse match stats synchronously
        #match_stats_data_scrape += synchronous(match_stats_url_suffixes, scrape_match_stats, tourney_index, tourney_slug)

    else:
        # No usable stats URLs: print a problem line for this tournament.
        # (Python 2 print statement.)
        spacing1 = format_spacing(5, tourney_index)
        spacing2 = format_spacing(15, tourney_slug)
        print tourney_index + spacing1 + '    ' + tourney_slug + spacing2 + '    Match stats URL problems'

    # STEP 3: Output to CSV
    filename = "match_stats_" + year + "_" + start_index
    array2csv(match_stats_data_scrape, filename)
    # Slug is the 5th path segment of the tournament URL suffix
    # (presumably '/en/scores/archive/<slug>/...' — verify against the URLs).
    tourney_url_suffix_split = tourney_url_suffixes[i].split('/')
    tourney_slug = tourney_url_suffix_split[4]

    # Match stats URL XPath
    # NOTE(review): the chained assignment also rebinds tourney_match_count_xpath
    # to the same selector — looks like a leftover; confirm it is intentional.
    match_stats_url_xpath = tourney_match_count_xpath = "//table[contains(@class, 'day-table')]/tbody[*]/tr[*]/td[contains(@class, 'day-table-score')]/a/@href"
    match_stats_url_cleaned = xpath_parse(tourney_tree, match_stats_url_xpath)

    # Filter problematic URL's
    # Keep only suffixes that do NOT contain '//'; the rest are dropped.
    match_stats_url_suffixes = []
    for foo in match_stats_url_cleaned:
        if foo.find('//') == -1:
            match_stats_url_suffixes.append(foo)

    # STEP 2: Parse match stats
    if len(match_stats_url_suffixes) > 0:

        # Parse match stats asynchronously
        match_stats_data_scrape += asynchronous(match_stats_url_suffixes, scrape_match_stats, tourney_index, tourney_slug)

        # Parse match stats synchronously
        #match_stats_data_scrape += synchronous(match_stats_url_suffixes, scrape_match_stats, tourney_index, tourney_slug)

    else:
        # No usable stats URLs: print a problem line for this tournament.
        # (Python 2 print statement.)
        spacing1 = format_spacing(5, tourney_index)
        spacing2 = format_spacing(15, tourney_slug)
        print tourney_index + spacing1 + '    ' + tourney_slug + spacing2 + '    Match stats URL problems'

    # STEP 3: Output to CSV
    filename = "match_stats_" + year + "_" + start_index
    array2csv(match_stats_data_scrape, filename)
Example #5
0
        elif len(move_down_parsed) > 0:
            move_direction = 'down'
        else:
            move_direction = ''

        # Per-player fields for row i of the rankings table.
        age = age_cleaned[i]
        # Points like '1,234' -> 1234.
        points = int(points_parsed[i].replace(',', ''))
        tourneys = tourneys_parsed[i]

        # Fall back to an ASCII transliteration when the name is not ASCII.
        try:
            name_text.encode('ascii')
        except UnicodeEncodeError:
            name_text = unidecode.unidecode(name_text)

        data = [
            week_title, week_year, week_month, week_day, rank_text,
            rank_number, move, move_direction, age, points, tourneys,
            player_url, player_slug, name_text, country_text, player_id
        ]
        rankings.append(data)

        # NOTE(review): the CSV is rewritten after every appended row —
        # presumably deliberate checkpointing; confirm array2csv semantics.
        filename = 'rankings_' + str(h) + '_' + week
        # NOTE(review): the quadruple quote below opens a throwaway string
        # literal ('"""' plus one stray '"') used to comment out the old
        # codecs-based writer; left byte-identical here.
        """" 
        with codecs.open(filename + '2.csv', 'w', encoding='utf8') as f:
            writer = csv.writer(f, delimiter=',')
            for row in rankings:
                writer.writerow(row)
        """
        array2csv(rankings, filename)

    # Progress line for this week.
    print(str(h) + "        " + week)
    # Iterate over every scraped tournament URL for this year (Python 2 xrange).
    for i in xrange(0 , len(tourney_urls_scrape)):
        if len(tourney_urls_scrape[i]) > 0:
            # STEP 2: Scrape tournament page    
            match_data_scrape = []
            match_urls_scrape = []

            scrape_tourney_output = scrape_tourney(tourney_urls_scrape[i])
            match_data_scrape = scrape_tourney_output[0]
            match_urls_scrape = scrape_tourney_output[1]
     
            #match_counter += len(match_data_scrape)

            # STEP 3: tourney_data + match_data
            # Prefix each match row with its tournament's row fields.
            for match in match_data_scrape:
                foo = tourney_data_scrape[i] + match
                tourney_match.append(foo)

            # Pad columns with spaces so the progress line stays aligned.
            spacing_count1 = len('Order') - len(str(tourney_data_scrape[i][1]))
            spacing1 = ''
            for j in xrange(0, spacing_count1): spacing1 += ' '

            spacing_count2 = 41 - len(tourney_data_scrape[i][2])
            spacing2 = ''
            for j in xrange(0, spacing_count2): spacing2 += ' '            

            # Python 2 print statement: per-tournament progress line.
            print year + '    ' + str(tourney_data_scrape[i][1]) + spacing1 + '    ' + tourney_data_scrape[i][2] + spacing2 + ' ' + str(len(match_data_scrape))

        # NOTE(review): the CSV is rewritten on every loop iteration —
        # presumably deliberate checkpointing; confirm intent.
        filename = "match_scores_" + start_year + "-" + end_year
        array2csv(tourney_match, filename)
        
            # Remaining columns of a player-profile row (fragment: the opening
            # of this list/append call is outside this chunk).
            # checkIfEmptyReturnFirst presumably returns the first element or a
            # placeholder when the scraped list is empty — verify its definition.
            checkIfEmptyReturnFirst(first_name_cleaned),
            checkIfEmptyReturnFirst(last_name_cleaned), rank, player_url,
            profile_picture, flag_code,
            checkIfEmptyReturnFirst(residence_cleaned),
            checkIfEmptyReturnFirst(birthplace_cleaned),
            checkIfEmptyReturnFirst(birthdate_cleaned), birth_year,
            birth_month, birth_day,
            checkIfEmptyReturnFirst(turned_pro_cleaned),
            checkIfEmptyReturnFirst(weight_lbs_cleaned),
            weight_kg_cleaned_extracted, height_ft_cleaned_extracted,
            height_inches, height_cm_cleaned_extracted,
            handedness_cleaned_extracted, backhand,
            checkIfEmptyReturnFirst(coach_cleaned),
            checkIfEmptyReturnFirst(career_high_cleaned),
            career_high_date_cleaned_extracted,
            checkIfEmptyReturnFirst(prize_money_year_cleaned),
            checkIfEmptyReturnFirst(prize_money_cleaned),
            checkIfEmptyReturnFirst(titles_year_cleaned),
            checkIfEmptyReturnFirst(titles_cleaned),
            checkIfEmptyReturnFirst(win_loss_year_cleaned),
            checkIfEmptyReturnFirst(win_loss_cleaned)
        ])

# Write the accumulated ranking rows to CSV.
array2csv(new_rows, 'rankings_0_2019-07-01')

# array2csv(profiles, 'profiles')
# UTF-8-aware CSV writer variant (project helper).
array2csv8utf(profiles, 'profiles')

# Report elapsed wall-clock time for the whole run
# ('start' is set earlier, outside this chunk).
end = time.time()
print(end - start)
import sys
from functions import tournaments, array2csv

# Command line input
# Command-line input: first and second args are the start and end years.
start_year = str(sys.argv[1])
end_year = str(sys.argv[2])

# Iterate through the years and scrape tourney data

# Fixed: the original used Python 2-only `print ...` statements and `xrange`,
# which are syntax/name errors under Python 3 and inconsistent with the
# identical script earlier in this file. `print('...')` and `range` produce
# the same output on both Python 2 and Python 3.
print('')
print('Year    Tournaments')
print('----    -----------')

tourney_data = []
for h in range(int(start_year), int(end_year) + 1):
    year = str(h)
    tourney_data += tournaments(year)

# Output to CSV
filename = 'tournaments_' + start_year + '-' + end_year
array2csv(tourney_data, filename)
        # Split the ISO week string 'YYYY-MM-DD' into numeric parts.
        week_split = week.split('-')
        week_year = int(week_split[0])
        week_month = int(week_split[1])
        week_day = int(week_split[2])

        # Dotted form used as the display title, e.g. '2019.07.01'.
        week_title = week.replace('-','.')

        # Rank movement: probe row i+1 for a move-up or move-down cell.
        move = move_cleaned[i]        
        move_up_xpath = "//table[@class='mega-table']/tbody/tr[" + str(i + 1) + "]/td[@class='move-cell']/div[@class='move-up']"
        move_up_parsed = xpath_parse(week_tree, move_up_xpath)
        move_down_xpath = "//table[@class='mega-table']/tbody/tr[" + str(i + 1) + "]/td[@class='move-cell']/div[@class='move-down']"
        move_down_parsed = xpath_parse(week_tree, move_down_xpath)
        if len(move_up_parsed) > 0:
            move_direction = 'up'
        elif len(move_down_parsed) > 0:
            move_direction = 'down'
        else:
            move_direction = ''

        age = age_cleaned[i]
        # Points like '1,234' -> 1234.
        points = int(points_parsed[i].replace(',', ''))
        tourneys = tourneys_parsed[i]

        data = [week_title, week_year, week_month, week_day, rank_text, rank_number, move, move_direction, age, points, tourneys, player_url, player_slug, player_id]
        rankings.append(data)
        
        # NOTE(review): the CSV is rewritten after every appended row —
        # presumably deliberate checkpointing; confirm array2csv semantics.
        filename = 'rankings_' + str(h) + '_' + week
        array2csv(rankings, filename)

    # Python 2 print statement: progress line for this week.
    print str(h) + "        " + week
    # Iterate over every scraped tournament URL for this year (Python 2 xrange).
    for i in xrange(0 , len(tourney_urls_scrape)):
        if len(tourney_urls_scrape[i]) > 0:
            # STEP 2: Scrape tournament page    
            match_data_scrape = []
            match_urls_scrape = []

            scrape_tourney_output = scrape_tourney(tourney_urls_scrape[i])
            match_data_scrape = scrape_tourney_output[0]
            match_urls_scrape = scrape_tourney_output[1]
     
            #match_counter += len(match_data_scrape)

            # STEP 3: tourney_data + match_data
            # Prefix each match row with its tournament's row fields.
            for match in match_data_scrape:
                foo = tourney_data_scrape[i] + match
                tourney_match.append(foo)

            # Pad columns with spaces so the progress line stays aligned.
            spacing_count1 = len('Order') - len(str(tourney_data_scrape[i][1]))
            spacing1 = ''
            for j in xrange(0, spacing_count1): spacing1 += ' '

            spacing_count2 = 41 - len(tourney_data_scrape[i][2])
            spacing2 = ''
            for j in xrange(0, spacing_count2): spacing2 += ' '            

            # Python 2 print statement: per-tournament progress line.
            print year + '    ' + str(tourney_data_scrape[i][1]) + spacing1 + '    ' + tourney_data_scrape[i][2] + spacing2 + ' ' + str(len(match_data_scrape))

        # NOTE(review): the CSV is rewritten on every loop iteration —
        # presumably deliberate checkpointing; confirm intent.
        filename = "match_scores_" + start_year + "-" + end_year
        array2csv(tourney_match, filename)