# Start with rosters CSV
rosters_url = url_base + roster_path
rosters_df = pd.read_csv(rosters_url, compression='gzip', low_memory=False)

# Grab the teams table from rosters dataframe: every 'team.*' column,
# de-duplicated (one row per season/team) and with the prefix stripped.
filter_col = [col for col in rosters_df if col.startswith('team.')]
teams_df = rosters_df[filter_col]
teams_df = teams_df.drop_duplicates()
teams_df = teams_df.rename(columns=lambda x: x.replace('team.', ''))
teams_pks = ['season', 'teamId']
repo.import_df('teams', teams_df, teams_pks, 'update')

# Grab the players table from the rosters dataframe: the 'teamPlayers.*'
# columns plus the season/team keys so each player row joins back to teams.
filter_col = [col for col in rosters_df if col.startswith('teamPlayers.')]
filter_col = filter_col + ['team.season', 'team.teamId']
# .copy() so the column edits below act on an independent frame rather than
# a view of rosters_df (avoids SettingWithCopyWarning / lost writes).
players_df = rosters_df[filter_col].copy()
# Strip both prefixes in a single rename pass ('team.' and 'teamPlayers.'
# never overlap within one column name, so order is irrelevant).
players_df = players_df.rename(
    columns=lambda x: x.replace('team.', '').replace('teamPlayers.', ''))

# jerseyNumber is read as float (e.g. 12.0, NaN because of missing values);
# normalize to a bare integer string, with '' for missing, by dropping
# everything after the decimal point.
players_df['jerseyNumber'] = (players_df['jerseyNumber']
                              .fillna('')
                              .astype(str)
                              .str.split('.')
                              .str[0])
players_df['birthDate'] = pd.to_datetime(players_df['birthDate'])
#
# Start main
#
import os

cases = []
transcripts = []
for file in glob.glob('supreme-court-cases/cases/*/*.js'):
    # Skip pathological entries whose basename is just '.js' (no case name).
    # The original compared the full glob path to '.js', which could never
    # match, making the guard dead code.
    if os.path.basename(file) == '.js':
        continue
    case = {}
    import_case_file(file, case, transcripts)
    convert_dates(case, date_convert)
    cases.append(case)

#
# Import to Dolt
#
cases_df = pandas.DataFrame(cases)
transcripts_df = pandas.DataFrame(transcripts)
# A transcript row is keyed by the case plus the utterance's position.
transcripts_pks = ['case_name', 'title', 'speaker', 'start']

repo = Dolt('./')
repo.import_df('cases', cases_df, ['case_name'], import_mode='replace')
repo.import_df('transcripts', transcripts_df, transcripts_pks,
               import_mode='replace')
# NOTE(review): this chunk opens mid-function — the matching `def` and `if`
# are above the visible region; the indentation of the `else:` below is a
# best guess (one conditional inside the function body). TODO confirm.
    else:
        # Parse e.g. 'Jan 5, 1990' into a datetime for this column.
        data[col] = datetime.datetime.strptime(f'{month} {day}, {year}',
                                               '%b %d, %Y')


#
# Coerce the JSON file into a flat dictionary
#
with open("supreme-court-cases/justices.js") as file:
    justice_dict = json.load(file)

justices = []
# NOTE(review): dict.keys(justice_dict) is an unusual spelling of
# justice_dict.keys(); behavior is identical.
for justice_name in dict.keys(justice_dict):
    justice = justice_dict[justice_name]
    output = {}
    # Flatten the nested justice record into `output` per column_map, then
    # normalize its date fields in place.
    normalize(justice, column_map, output)
    convert_dates(output, date_cols)
    convert_dates_3col(output, date_3cols)
    justices.append(output)

#
# Import into Dolt
#
justices_df = pandas.DataFrame(justices)

# Convert boolean columns using pandas: multiplying by 1 maps
# True/False to 1/0 for SQL-friendly storage.
justices_df[["ethnic"]] *= 1

repo = Dolt('./')
repo.import_df('justices', justices_df, ['name'], import_mode='replace')
#!/usr/local/bin/python3

import pandas as pd
from doltpy.core import Dolt
from pprint import pprint

# Base URL for the nflfastR data dumps (one gzipped CSV per season).
url_base = 'https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/'

repo = Dolt('.')

# Download the 2000-2019 seasons and stack them into one frame. Collecting
# the yearly frames and concatenating once avoids the quadratic cost of
# growing the frame per iteration — and DataFrame.append was removed in
# pandas 2.0; pd.concat(..., sort=True) is the supported equivalent.
season_frames = []
for year in range(2000, 2020):
    url = url_base + 'play_by_play_' + str(year) + '.csv.gz?raw=True'
    season_frames.append(pd.read_csv(url, compression='gzip', low_memory=False))
pbp_df = pd.concat(season_frames, sort=True)

# Give each row a unique index
pbp_df.reset_index(drop=True, inplace=True)

# (game_id, play_id) uniquely identifies a play across all seasons.
plays_pks = ['game_id', 'play_id']
repo.import_df('plays', pbp_df, plays_pks, 'update')