def main():
    """Scrape every page in parallel, merge the results, and save age data to CSV."""
    started = time.time()

    # One task per page, fanned out across 8 worker processes.
    pages = getPageCount()
    worker_pool = Pool(processes=8)
    # Each processPage call returns (ages, noAgeCount, usersProcessed).
    page_results = worker_pool.map(processPage, range(1, pages + 1))

    # Fold the per-page triples into run-wide totals.
    all_ages = []
    missing_age_total = 0
    user_total = 0
    for page_ages, page_missing, page_users in page_results:
        all_ages += page_ages
        missing_age_total += page_missing
        user_total += page_users

    # Wall-clock duration in minutes, one decimal place.
    elapsed_minutes = round((time.time() - started) / 60, 1)

    # Persist the combined age data, then report the run stats.
    gen.listToCSV('CD-Ages-' + time.strftime('%Y-%m-%d'), all_ages)
    print(missing_age_total, 'out of', user_total, 'had no age set. :(')
    print('Scraping run took', elapsed_minutes, 'minutes.')
def saveTeamList(year):
    """Fetch the team list for *year* from TBA and write it to CSV.

    NOTE(review): fileExists is deliberately overridden to False below, so
    the list is re-fetched on every call even when the cached file exists —
    confirm that force-refresh is still intended.
    """
    fileExists, fullPath = filePathHandler('teams', None, 'teams', year)
    # Force a refresh regardless of the cached file. (A duplicated
    # `fileExists = False` statement was removed; behavior is unchanged.)
    fileExists = False
    if not fileExists:
        try:
            teams = []
            # TODO(review): 16 pages is hard-coded; if TBA ever serves more
            # pages for a season, the tail would be silently truncated.
            for page in range(0, 16):
                teams += tba.teams(page, year, False, True)
            gen.listToCSV(fullPath, teams)
        except Exception as e:
            # Best-effort save: report the failure but don't crash the caller.
            print(e)
# Base URL for each supported social-media profile type.
baseURLs = {
    'facebook-profile': 'www.facebook.com/',
    'twitter-profile': 'www.twitter.com/',
    'youtube-profile': 'www.youtube.com/',
    'github-profile': 'www.github.com/',
    'instagram-profile': 'www.instagram.com/',
    'periscope-profile': 'www.periscope.com/'
}
# CSV column order: derive directly from baseURLs instead of keeping a
# separate hand-maintained list that must match its keys.
profileTypes = list(baseURLs)

teamData = []
for team in tba.event_teams(event, False, True):
    teamMedia = tba.team_profiles(team)
    # Row format: team key, then one column per profile type (blank when
    # the team has no profile of that type).
    outString = team + ', '
    for profile in profileTypes:
        # The unused `foundMatch` flag from the original was removed.
        for prof in teamMedia:
            if prof['type'] == profile:
                outString += baseURLs[profile] + prof['foreign_key']
        outString += ', '
    teamData.append(outString)

gen.listToCSV(event + 'socialData', teamData)
import gen

# Export the team numbers attending the 2019 Chesapeake district
# championship as a sorted CSV.
tba = gen.setup()
dcmp = '2019chcmp'

# Team keys look like 'frcNNNN'; drop the 'frc' prefix and sort numerically.
teamNumbers = sorted(int(team[3:]) for team in tba.event_teams(dcmp, keys=True))
gen.listToCSV(dcmp + ' Teams', teamNumbers)
import gen
from tqdm import tqdm

tba = gen.setup()

#Set this to None for ALL teams
year = 2019

teamList = []
# TBA paginates its team list; stop at the first empty page. `not` (rather
# than `== []`) also terminates cleanly if the client ever returns None.
for page in tqdm(range(0, 40)):
    currentTeams = tba.teams(page, year, False, True)
    if not currentTeams:
        break
    teamList += currentTeams

# File name reflects whether this run was year-filtered or all teams.
fileKey = str(year) + 'TeamKeys' if year is not None else 'allTeamKeys'
gen.listToCSV(fileKey, teamList)
import gen

year = 2019
tba = gen.setup()

# Gather the numeric team numbers from every district in the season.
teams = []
for district in tba.districts(year):
    districtTeamKeys = tba.district_teams(district['key'], False, True)
    # Keys look like 'frcNNNN'; keep only the number.
    teams.extend(int(key[3:]) for key in districtTeamKeys)

gen.listToCSV(str(year) + ' District Teams', teams)
def saveUpdateDate(updateDate):
    """Persist *updateDate* as a single-entry CSV in the current year's folder."""
    outputPath = baseFolder + str(year) + '/UpdateDate'
    gen.listToCSV(outputPath, [updateDate])